library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.3
## ✓ tibble  3.0.1     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tree)
## Registered S3 method overwritten by 'tree':
##   method     from
##   print.tree cli
options(warn = 1)  # print warnings immediately as they occur
set.seed(1)
# Load the Carseats data set; "?" is treated as missing.
# NOTE(review): absolute path — consider a relative path for portability.
carseats.df <- read.csv("/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/Carseats.csv", 
                      header = TRUE, stringsAsFactors = FALSE, na.strings = "?")

carseats.df <- tibble(carseats.df)


# find NAs

# Rows where EVERY column is NA (all_vars); the result is printed, not stored.
# NOTE(review): to flag rows with ANY missing value, any_vars(is.na(.)) would be
# needed instead — confirm which check was intended.
carseats.df %>% 
   filter_all(all_vars(is.na(.)))
# Convert every character column of a data frame to a factor whose levels are
# the observed values in the order produced by table() (sorted unique values).
char.to.fctor <- function(df) {
  to_factor <- function(col) factor(col, levels = names(table(col)))
  df %>% mutate_if(is.character, to_factor)
}

carseats.df <- char.to.fctor(carseats.df)

# Add new column High as label for all records
# High = "Yes" when Sales > 8 (thousands of units), "No" otherwise
carseats.df$High <- ifelse(carseats.df$Sales <= 8, "No", "Yes")

# convert High column into factor
# NOTE(review): ordered() creates an ORDERED factor (Yes < No); the second run
# below uses plain factor() instead — confirm which is intended for tree().
carseats.df$High <- ordered(carseats.df$High, levels = c("Yes","No"))
str(carseats.df)
## tibble [400 × 12] (S3: tbl_df/tbl/data.frame)
##  $ Sales      : num [1:400] 9.5 11.22 10.06 7.4 4.15 ...
##  $ CompPrice  : int [1:400] 138 111 113 117 141 124 115 136 132 132 ...
##  $ Income     : int [1:400] 73 48 35 100 64 113 105 81 110 113 ...
##  $ Advertising: int [1:400] 11 16 10 4 3 13 0 15 0 0 ...
##  $ Population : int [1:400] 276 260 269 466 340 501 45 425 108 131 ...
##  $ Price      : int [1:400] 120 83 80 97 128 72 108 120 124 124 ...
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Age        : int [1:400] 42 65 59 55 38 78 71 67 76 76 ...
##  $ Education  : int [1:400] 17 10 12 14 13 16 15 10 10 17 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
##  $ High       : Ord.factor w/ 2 levels "Yes"<"No": 1 1 1 2 2 1 2 1 2 2 ...
# construct a classification tree on data 
# Sales is excluded from the predictors because High is derived from it
tree.carseats <- tree(High ~ .-Sales, carseats.df)

summary(tree.carseats)
## 
## Classification tree:
## tree(formula = High ~ . - Sales, data = carseats.df)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "US"          "Income"      "CompPrice"  
## [6] "Population"  "Advertising" "Age"        
## Number of terminal nodes:  27 
## Residual mean deviance:  0.4575 = 170.7 / 373 
## Misclassification error rate: 0.09 = 36 / 400
# Draw the tree skeleton and label the splits (pretty = 0 keeps factor level names)
plot(tree.carseats)
text(tree.carseats, pretty=0)

# Printing the tree lists every node: split rule, n, deviance, prediction, class probs
tree.carseats
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 400 541.500 No ( 0.41000 0.59000 )  
##     2) ShelveLoc: Good 85  90.330 Yes ( 0.77647 0.22353 )  
##       4) Price < 135 68  49.260 Yes ( 0.88235 0.11765 )  
##         8) US: No 17  22.070 Yes ( 0.64706 0.35294 )  
##          16) Price < 109 8   0.000 Yes ( 1.00000 0.00000 ) *
##          17) Price > 109 9  11.460 No ( 0.33333 0.66667 ) *
##         9) US: Yes 51  16.880 Yes ( 0.96078 0.03922 ) *
##       5) Price > 135 17  22.070 No ( 0.35294 0.64706 )  
##        10) Income < 46 6   0.000 No ( 0.00000 1.00000 ) *
##        11) Income > 46 11  15.160 Yes ( 0.54545 0.45455 ) *
##     3) ShelveLoc: Bad,Medium 315 390.600 No ( 0.31111 0.68889 )  
##       6) Price < 92.5 46  56.530 Yes ( 0.69565 0.30435 )  
##        12) Income < 57 10  12.220 No ( 0.30000 0.70000 )  
##          24) CompPrice < 110.5 5   0.000 No ( 0.00000 1.00000 ) *
##          25) CompPrice > 110.5 5   6.730 Yes ( 0.60000 0.40000 ) *
##        13) Income > 57 36  35.470 Yes ( 0.80556 0.19444 )  
##          26) Population < 207.5 16  21.170 Yes ( 0.62500 0.37500 ) *
##          27) Population > 207.5 20   7.941 Yes ( 0.95000 0.05000 ) *
##       7) Price > 92.5 269 299.800 No ( 0.24535 0.75465 )  
##        14) Advertising < 13.5 224 213.200 No ( 0.18304 0.81696 )  
##          28) CompPrice < 124.5 96  44.890 No ( 0.06250 0.93750 )  
##            56) Price < 106.5 38  33.150 No ( 0.15789 0.84211 )  
##             112) Population < 177 12  16.300 No ( 0.41667 0.58333 )  
##               224) Income < 60.5 6   0.000 No ( 0.00000 1.00000 ) *
##               225) Income > 60.5 6   5.407 Yes ( 0.83333 0.16667 ) *
##             113) Population > 177 26   8.477 No ( 0.03846 0.96154 ) *
##            57) Price > 106.5 58   0.000 No ( 0.00000 1.00000 ) *
##          29) CompPrice > 124.5 128 150.200 No ( 0.27344 0.72656 )  
##            58) Price < 122.5 51  70.680 Yes ( 0.50980 0.49020 )  
##             116) ShelveLoc: Bad 11   6.702 No ( 0.09091 0.90909 ) *
##             117) ShelveLoc: Medium 40  52.930 Yes ( 0.62500 0.37500 )  
##               234) Price < 109.5 16   7.481 Yes ( 0.93750 0.06250 ) *
##               235) Price > 109.5 24  32.600 No ( 0.41667 0.58333 )  
##                 470) Age < 49.5 13  16.050 Yes ( 0.69231 0.30769 ) *
##                 471) Age > 49.5 11   6.702 No ( 0.09091 0.90909 ) *
##            59) Price > 122.5 77  55.540 No ( 0.11688 0.88312 )  
##             118) CompPrice < 147.5 58  17.400 No ( 0.03448 0.96552 ) *
##             119) CompPrice > 147.5 19  25.010 No ( 0.36842 0.63158 )  
##               238) Price < 147 12  16.300 Yes ( 0.58333 0.41667 )  
##                 476) CompPrice < 152.5 7   5.742 Yes ( 0.85714 0.14286 ) *
##                 477) CompPrice > 152.5 5   5.004 No ( 0.20000 0.80000 ) *
##               239) Price > 147 7   0.000 No ( 0.00000 1.00000 ) *
##        15) Advertising > 13.5 45  61.830 Yes ( 0.55556 0.44444 )  
##          30) Age < 54.5 25  25.020 Yes ( 0.80000 0.20000 )  
##            60) CompPrice < 130.5 14  18.250 Yes ( 0.64286 0.35714 )  
##             120) Income < 100 9  12.370 No ( 0.44444 0.55556 ) *
##             121) Income > 100 5   0.000 Yes ( 1.00000 0.00000 ) *
##            61) CompPrice > 130.5 11   0.000 Yes ( 1.00000 0.00000 ) *
##          31) Age > 54.5 20  22.490 No ( 0.25000 0.75000 )  
##            62) CompPrice < 122.5 10   0.000 No ( 0.00000 1.00000 ) *
##            63) CompPrice > 122.5 10  13.860 No ( 0.50000 0.50000 )  
##             126) Price < 125 5   0.000 Yes ( 1.00000 0.00000 ) *
##             127) Price > 125 5   0.000 No ( 0.00000 1.00000 ) *

Tree deviance is defined as: \(-2\sum_{m=1}^{|T_{0}|} \sum_{k=1}^K n_{m,k} \ln \widehat{p}_{m,k}\), where \(m\) ranges over the \(|T_{0}|\) terminal nodes and \(n_{m,k}\) is the number of class-\(k\) observations in node \(m\).

Residual mean deviance is: \(\frac{-2\sum_{m=1}^{|T_{0}|} \sum_{k=1}^K n_{m,k} \ln \widehat{p}_{m,k}}{N - |T_{0}|}\)

Where N is Number of observations.

Note that values for each branch are as follows:

ShelveLoc: Good \((split\,criterion)\) 85 \((number\,of\,observations\,in\,the\,branch)\) 90.330 \((deviance)\) Yes \((overall\,prediction\,for\,the\,branch)\) (0.77647 0.22353 ) \((fractions\,of\,observations\,in\,each\,class\,for\,the\,branch)\)

library(tidyverse)
library(tree)
options(warn = 1)  # print warnings immediately as they occur
set.seed(1)
# Reload the Carseats data for the train/test evaluation section
carseats.df <- read.csv("/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/Carseats.csv", 
                      header = TRUE, stringsAsFactors = FALSE, na.strings = "?")

carseats.df <- tibble(carseats.df)


# find NAs

# Rows where every column is NA (result printed, not stored)
carseats.df %>% 
   filter_all(all_vars(is.na(.))) 
# convert all character columns to factor
char.to.fctor <- function(df)
  df %>%
    mutate_if(is.character, ~ factor(.x, levels = (.x %>% table() %>% names())))

carseats.df <- char.to.fctor(carseats.df)

# Add new column High as label for all records
# High = "Yes" when Sales > 8 (thousands of units), "No" otherwise
carseats.df$High <- ifelse(carseats.df$Sales <= 8, "No", "Yes")

# convert High column into factor
# NOTE(review): plain factor here, but the first run used ordered() — confirm
carseats.df$High <- factor(carseats.df$High, levels = c("Yes","No"))

set.seed(1113)

# 70/30 train/test split by row index
no.of.train <- ceiling(0.7 * nrow(carseats.df))
no.of.test <- nrow(carseats.df) - no.of.train

train.idx <- sample(seq_len(nrow(carseats.df)), no.of.train)
test.idx <- setdiff(seq_len(nrow(carseats.df)), train.idx)

# build classification tree on train data
tree.carseats <- tree(High ~ .-Sales, carseats.df[train.idx,])

plot(tree.carseats)
text(tree.carseats, pretty = 0)

# do the prediction on test data (type="class" makes R return class predictions):
# pred.probs holds per-class probabilities (one column per level of High)
pred.probs <- predict(tree.carseats,carseats.df[test.idx,])
tree.pred <- predict(tree.carseats, carseats.df[test.idx,], type = "class")

# see the confusion matrix
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Rows of the table are predictions, columns are observed truth
(confusion_table <- table(tree.pred, carseats.df[test.idx,]$High))
##          
## tree.pred Yes No
##       Yes  33  8
##       No   22 57
# Null classifier accuracy = share of the majority class
# (max over the two column totals divided by the grand total)
nullClassifier <- max(
    (confusion_table[1,1] + confusion_table[2,1])/(confusion_table[1,1] + confusion_table[2,1]+ confusion_table[1,2] + confusion_table[2,2] ), 
    (confusion_table[1,2] + confusion_table[2,2])/(confusion_table[1,1] + confusion_table[2,1]+ confusion_table[1,2] + confusion_table[2,2] ))
  
# ROC curve built from the first probability column, P(High = "Yes")
roc_obj <- roc(unlist(carseats.df[test.idx,]$High), pred.probs[, 1])
## Setting levels: control = Yes, case = No
## Setting direction: controls > cases
# Let's draw some AUC plots
plot(roc_obj, legacy.axes = TRUE)

missclassificationRate <- mean(tree.pred != carseats.df[test.idx,]$High)
# Metrics below treat "No" (row/col index 2) as the positive class;
# rows of confusion_table = predictions, columns = observed truth.
FP_rates <- confusion_table[2,1]/(confusion_table[2,1]+ confusion_table[1,1])
TP_rates <- confusion_table[2,2]/(confusion_table[2,2]+ confusion_table[1,2])
precisions <- confusion_table[2,2] / (confusion_table[2,2] + confusion_table[2,1])
specificities <-  1 - confusion_table[2,1]/(confusion_table[2,1]+ confusion_table[1,1]) 
classification.rate <- (confusion_table[2,2] + confusion_table[1,1]) / 
  (confusion_table[1,1]+ confusion_table[1,2] + confusion_table[2,1]+ confusion_table[2,2])
# overall fraction of wrong predictions:
# print(confusion_table)

# Report all test-set metrics; the null classifier (majority vote) is the baseline to beat
sprintf("Null Classifier error rate : %s", 1 - nullClassifier)
## [1] "Null Classifier error rate : 0.458333333333333"
# average missclassification error rate
sprintf("tree classifier : Missclassification error rate : %s", missclassificationRate)
## [1] "tree classifier : Missclassification error rate : 0.25"
# FP rate:
sprintf("tree classifier : FP rate (TypeI error, 1 - specificity) : %s", FP_rates)
## [1] "tree classifier : FP rate (TypeI error, 1 - specificity) : 0.4"
# tree classification 
print(glue::glue("Tree classification rate: ", classification.rate))
## Tree classification rate: 0.75
# Null classifier rate
sprintf("Null Classifier rate: %s", nullClassifier)
## [1] "Null Classifier rate: 0.541666666666667"
# TP rate:
sprintf("tree classifier : TP rate (1-TypeII error, power, sensetivity, recall) : %s", TP_rates)
## [1] "tree classifier : TP rate (1-TypeII error, power, sensetivity, recall) : 0.876923076923077"
# precision:
sprintf("tree classifier : precision: %s", precisions)
## [1] "tree classifier : precision: 0.721518987341772"
# specificity 1-FP/N:
sprintf("tree classifier : specificity 1-FP/N: %s", specificities)
## [1] "tree classifier : specificity 1-FP/N: 0.6"
# -----------------------------------------------------------------------------
# ---------- Lets prune the tree to see if we get better results ---------------
# cv.tree with FUN = prune.misclass uses the misclassification count (not
# deviance) to guide 10-fold cost-complexity pruning.
cv.carseats <- cv.tree(object = tree.carseats, FUN = prune.misclass, K = 10)

# Number of terminal nodes of each tree that CV considered
cv.carseats$size
## [1] 23 19 15 13  9  7  5  2  1
# Error rate corresponding to each tree that CV considered
cv.carseats$dev
## [1]  83  82  82  84  83  85  89  98 111
# value of cost complexity parameter alpha that corresponds to each tree considered by CV
cv.carseats$k
## [1]      -Inf  0.000000  0.500000  1.000000  2.000000  3.000000  5.000000
## [8]  9.666667 25.000000
# plot the missclassification error rate as the function of size and k 
plot(cv.carseats$size, cv.carseats$dev, type = 'b')

plot(cv.carseats$k, cv.carseats$dev, type = 'b')

# seems like 9 is the best size of the tree, let's prune it to get to tree with size 9
# NOTE(review): the CV minimum (dev = 82) is at sizes 19 and 15; size 9 (dev = 83)
# trades a tiny bit of CV error for a much smaller tree — confirm that is intended.
prune.carseats <- prune.misclass(tree.carseats, best = 9)

#original data
str(carseats.df)
## tibble [400 × 12] (S3: tbl_df/tbl/data.frame)
##  $ Sales      : num [1:400] 9.5 11.22 10.06 7.4 4.15 ...
##  $ CompPrice  : int [1:400] 138 111 113 117 141 124 115 136 132 132 ...
##  $ Income     : int [1:400] 73 48 35 100 64 113 105 81 110 113 ...
##  $ Advertising: int [1:400] 11 16 10 4 3 13 0 15 0 0 ...
##  $ Population : int [1:400] 276 260 269 466 340 501 45 425 108 131 ...
##  $ Price      : int [1:400] 120 83 80 97 128 72 108 120 124 124 ...
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Age        : int [1:400] 42 65 59 55 38 78 71 67 76 76 ...
##  $ Education  : int [1:400] 17 10 12 14 13 16 15 10 10 17 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
##  $ High       : Factor w/ 2 levels "Yes","No": 1 1 1 2 2 1 2 1 2 2 ...
#pruned tree
summary(prune.carseats)
## 
## Classification tree:
## snip.tree(tree = tree.carseats, nodes = c(26L, 54L, 49L, 55L, 
## 2L, 96L))
## Variables actually used in tree construction:
## [1] "Price"       "ShelveLoc"   "Age"         "Advertising" "CompPrice"  
## Number of terminal nodes:  9 
## Residual mean deviance:  0.7524 = 203.9 / 271 
## Misclassification error rate: 0.1393 = 39 / 280
plot(prune.carseats)
text(prune.carseats, pretty = 0)

# let's run it on test set to see the misclassification rate again:


# do the prediction on test data (type="class" makes R return class predictions):
pred.probs <- predict(prune.carseats,carseats.df[test.idx,])
tree.pred <- predict(prune.carseats, carseats.df[test.idx,], type = "class")

# see the confusion matrix
library(pROC)
(confusion_table <- table(tree.pred, carseats.df[test.idx,]$High))
##          
## tree.pred Yes No
##       Yes  33 13
##       No   22 52
# Null classifier accuracy = share of the majority class (see first run above)
nullClassifier <- max(
    (confusion_table[1,1] + confusion_table[2,1])/(confusion_table[1,1] + confusion_table[2,1]+ confusion_table[1,2] + confusion_table[2,2] ), 
    (confusion_table[1,2] + confusion_table[2,2])/(confusion_table[1,1] + confusion_table[2,1]+ confusion_table[1,2] + confusion_table[2,2] ))
  
# ROC for the pruned tree, built from P(High = "Yes") (first probability column)
roc_obj <- roc(unlist(carseats.df[test.idx,]$High), pred.probs[, 1])
## Setting levels: control = Yes, case = No
## Setting direction: controls > cases
# Let's draw some AUC plots
plot(roc_obj, legacy.axes = TRUE)

# Same metric definitions as above: "No" (index 2) is the positive class
missclassificationRate <- mean(tree.pred != carseats.df[test.idx,]$High)
FP_rates <- confusion_table[2,1]/(confusion_table[2,1]+ confusion_table[1,1])
TP_rates <- confusion_table[2,2]/(confusion_table[2,2]+ confusion_table[1,2])
precisions <- confusion_table[2,2] / (confusion_table[2,2] + confusion_table[2,1])
specificities <-  1 - confusion_table[2,1]/(confusion_table[2,1]+ confusion_table[1,1]) 
classification.rate <- (confusion_table[2,2] + confusion_table[1,1]) / 
  (confusion_table[1,1]+ confusion_table[1,2] + confusion_table[2,1]+ confusion_table[2,2])
# overall fraction of wrong predictions:
# print(confusion_table)


# Report pruned-tree test metrics (misclassification rose from 0.25 to 0.29)
sprintf("Null Classifier error rate : %s", 1 - nullClassifier)
## [1] "Null Classifier error rate : 0.458333333333333"
# average missclassification error rate
sprintf("tree classifier : Missclassification error rate : %s", missclassificationRate)
## [1] "tree classifier : Missclassification error rate : 0.291666666666667"
# FP rate:
sprintf("tree classifier : FP rate (TypeI error, 1 - specificity) : %s", FP_rates)
## [1] "tree classifier : FP rate (TypeI error, 1 - specificity) : 0.4"
# tree classification 
print(glue::glue("Tree classification rate: ", classification.rate))
## Tree classification rate: 0.708333333333333
# Null classifier rate
sprintf("Null Classifier: %s", nullClassifier)
## [1] "Null Classifier: 0.541666666666667"
# TP rate:
sprintf("tree classifier : TP rate (1-TypeII error, power, sensetivity, recall) : %s", TP_rates)
## [1] "tree classifier : TP rate (1-TypeII error, power, sensetivity, recall) : 0.8"
# precision:
sprintf("tree classifier : precision: %s", precisions)
## [1] "tree classifier : precision: 0.702702702702703"
# specificity 1-FP/N:
sprintf("tree classifier : specificity 1-FP/N: %s", specificities)
## [1] "tree classifier : specificity 1-FP/N: 0.6"

Remember that to each given value of \(\alpha\) (k in R) we assign a subtree \(T_{\alpha}\) as below:
Using Gini index:
\(T_{\alpha} = \underset{T \subset T_{0}}{\operatorname{argmin}} \left\{ \sum\limits_{m=1}^{|T|} \sum\limits_{k=1}^{K} \hat{p}_{m,k} \cdot (1 - \hat{p}_{m,k}) + \alpha \cdot |T| \right\}\)

or using Cross Entropy:

\(T_{\alpha} = \underset{T \subset T_{0}}{\operatorname{argmin}} \left\{ \sum\limits_{m=1}^{|T|} \sum\limits_{k=1}^{K} -\hat{p}_{m,k} \cdot \log (\hat{p}_{m,k}) + \alpha \cdot |T| \right\}\)

And then we use CV to find a few good values of \(\alpha\) and their corresponding \(T_{\alpha}\) based on misclassification error rate (FUN = prune.misclass).

library(tidyverse)
library(dataPreparation)
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: progress
## dataPreparation 0.4.3
## Type dataPrepNews() to see new features/changes/bug fixes.
library(tree)
# Load the Boston housing data; "?" is treated as missing
boston.df <- read.csv(
  "/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/BostonHousing.csv", 
  header = TRUE, stringsAsFactors = FALSE, na.strings = "?")

boston.df <- tibble(boston.df)

# see if there is any NA in any records
na.idx <- which(is.na(boston.df))

# No NA anywhere , let's do usual clean up:

# remove constant variables
constant_cols <- whichAreConstant(boston.df)
## [1] "whichAreConstant: it took me 0.01s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(boston.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(boston.df))
## [1] "whichAreBijection: it took me 0.14s to identify 0 column(s) to drop."
## integer(0)
# Drop bijection columns when any were found. Use scalar && inside if();
# whichAreBijection returns integer(0) when nothing is found, so the length
# check is the one that decides.
boston.df <- if (!is.null(bijections_cols) && length(bijections_cols) > 0) boston.df[- bijections_cols] else boston.df

str(boston.df)
## Classes 'data.table' and 'data.frame':   506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : int  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ b      : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
set.seed(1113)
# 70/30 train/test split by row index
no.of.train <- ceiling(0.7 * nrow(boston.df))
no.of.test <- nrow(boston.df) - no.of.train

train.idx <- sample(seq_len(nrow(boston.df)), no.of.train)
test.idx <- setdiff(seq_len(nrow(boston.df)), train.idx)

# Regression tree (medv is numeric) fitted on the training rows only
tree.boston <- tree(medv ~ ., boston.df, subset = train.idx)

plot(tree.boston)
text(tree.boston, pretty = 0)

# Print every node: split rule, n, deviance, fitted mean response
tree.boston
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 355 31270.0 22.66  
##    2) rm < 6.8375 295 12020.0 19.70  
##      4) lstat < 15 182  4907.0 22.95  
##        8) lstat < 9.68 97  3357.0 25.16  
##         16) age < 89.45 92  1374.0 24.26  
##           32) rm < 6.5445 61   633.1 22.72 *
##           33) rm > 6.5445 31   313.6 27.28 *
##         17) age > 89.45 5   507.2 41.90 *
##        9) lstat > 9.68 85   528.1 20.42 *
##      5) lstat > 15 113  2091.0 14.46  
##       10) crim < 5.76921 53   633.1 17.34 *
##       11) crim > 5.76921 60   631.8 11.92 *
##    3) rm > 6.8375 60  3914.0 37.24  
##      6) rm < 7.445 39  1087.0 32.64 *
##      7) rm > 7.445 21   475.9 45.77 *
# let's draw the residuals pdf, very close to standard normal
plot(density(summary(tree.boston)$residuals))

# deviance is sum of squared error for the tree

# do the prediction on test data 
pred.values <- predict(tree.boston, newdata = boston.df[test.idx,])
obs.values <- boston.df[test.idx,]$medv
# find the R^2 for test data
# (squared correlation between predictions and observed values)
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.705
# now plot the observed vs predicted on test data
plot.data <- data.frame("Pred.values" = pred.values, "obs.values" = obs.values, 
                                "DataSet" = "test")
residuals <- obs.values - pred.values

# Segment endpoints: each residual drawn as a vertical line from the
# prediction up/down to the observation (y2 = pred + resid = obs)
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values, 
                "x1" = obs.values, "y2" = pred.values + residuals,
                "DataSet" = "test")

ggplot() + 
        # plot test samples
        geom_point(data = plot.data, 
            aes(x = obs.values, y = pred.values, color = DataSet)) +
        # plot residuals
        geom_segment(data = plot.residuals, alpha = 0.2,
            aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
        # plot optimal regressor
        geom_abline(color = "blue", slope = 1)

# plot the residuals
plot(density(residuals))

plot(pred.values, obs.values)
abline(0,1)

# testMSE
mean((obs.values - pred.values)^2)
## [1] 24.43265
#---------------------------------------------------------
# Now let's prune the tree to see if it performs better
# --------------------------------------------------------
# Default cv.tree criterion for a regression tree is deviance (RSS)
cv.boston <- cv.tree(tree.boston, K = 10)
plot(cv.boston$size, cv.boston$dev, type='b')

# The deviance is lowest at 7, so let's construct a pruned tree with 7 nodes
prune.boston <- prune.tree(tree = tree.boston, best=7)

# summary of pruned tree
summary(prune.boston)
## 
## Regression tree:
## snip.tree(tree = tree.boston, nodes = 16L)
## Variables actually used in tree construction:
## [1] "rm"    "lstat" "age"   "crim" 
## Number of terminal nodes:  7 
## Residual mean deviance:  15.05 = 5237 / 348 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -14.9000  -2.2550  -0.1554   0.0000   2.2500  17.3600
plot(prune.boston)
text(prune.boston, pretty = 0)

# let's draw the residuals pdf for pruned tree
plot(density(summary(prune.boston)$residuals))

# do the prediction on test data 
pred.values <- predict(prune.boston, newdata = boston.df[test.idx,])
obs.values <- boston.df[test.idx,]$medv
# find the R^2 for test data
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.694
# now plot the observed vs predicted on test data
plot.data <- data.frame("Pred.values" = pred.values, "obs.values" = obs.values, 
                                "DataSet" = "test")
residuals <- obs.values - pred.values

plot.residuals <- data.frame("x" = obs.values, "y" = pred.values, 
                "x1" = obs.values, "y2" = pred.values + residuals,
                "DataSet" = "test")

ggplot() + 
        # plot test samples
        geom_point(data = plot.data, 
            aes(x = obs.values, y = pred.values, color = DataSet)) +
        # plot residuals
        geom_segment(data = plot.residuals, alpha = 0.2,
            aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
        # plot optimal regressor
        geom_abline(color = "blue", slope = 1)

# plot the residuals
plot(density(residuals))

plot(pred.values, obs.values)
abline(0,1)

# testMSE
mean((obs.values - pred.values)^2)
## [1] 25.29452
# seemed like pruning downgraded the performance of the model 
# So pruning the tree does not add to performance

# From another perspective, since testMSE ~ 25, RMSE ~ 5; as medv is in $1000s,
# predictions are typically within about $5k of the true median home value in a suburb
library(tidyverse)
library(dataPreparation)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Reload the Boston housing data for the bagging / random-forest section
boston.df <- read.csv(
  "/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/BostonHousing.csv", 
  header = TRUE, stringsAsFactors = FALSE, na.strings = "?")

boston.df <- tibble(boston.df)

# see if there is any NA in any records
na.idx <- which(is.na(boston.df))

# No NA anywhere , let's do usual clean up:

# remove constant variables
constant_cols <- whichAreConstant(boston.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(boston.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(boston.df))
## [1] "whichAreBijection: it took me 0.05s to identify 0 column(s) to drop."
## integer(0)
# Drop bijection columns when any were found; scalar && inside if()
# (whichAreBijection returns integer(0), never NULL, so the length check decides)
boston.df <- if (!is.null(bijections_cols) && length(bijections_cols) > 0) boston.df[- bijections_cols] else boston.df

set.seed(1113)
# 70/30 train/test split by row index
no.of.train <- ceiling(0.7 * nrow(boston.df))
no.of.test <- nrow(boston.df) - no.of.train

train.idx <- sample(seq_len(nrow(boston.df)), no.of.train)
test.idx <- setdiff(seq_len(nrow(boston.df)), train.idx)

# ---------------------------- Bagging  ---------------------------

# Bagging = random forest with mtry equal to the full number of predictors
# (mtry is the number of predictors considered for each split)
bag.boston <- randomForest(medv ~ ., data = boston.df, subset = train.idx,  
                           mtry = ncol(boston.df) - 1, importance = TRUE)

# importance of each predictor
# % IncMSE : Increase in out of bag MSE When given predictor excluded from model 
# IncNodePurity : Decrease on training RSS (regression) or deviance (classification) 
#                 in each split
importance(bag.boston)
##           %IncMSE IncNodePurity
## crim    12.701304    1103.39712
## zn       2.039922      25.44568
## indus    7.278869     157.72731
## chas     2.124328      49.42790
## nox     22.919938     611.15533
## rm      64.312074   16619.62955
## age      8.083396     329.63566
## dis     26.378734    2362.84953
## rad      3.319187      91.67949
## tax     19.010440     556.55256
## ptratio 15.070773     356.08397
## b        4.879778     246.14704
## lstat   31.459875    8339.57390
# plot importance predictors
varImpPlot(bag.boston)

# do the prediction on test data 
pred.values <- predict(bag.boston, newdata = boston.df[test.idx,])
obs.values <- boston.df[test.idx,]$medv
# find the R^2 for test data (Bagging)
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.813
# now plot the observed vs predicted on test data
plot.data <- data.frame("pred.values" = pred.values, "obs.values" = obs.values, 
                                "DataSet" = "test")
residuals <- obs.values - pred.values

plot.residuals <- data.frame("x" = obs.values, "y" = pred.values, 
                "x1" = obs.values, "y2" = pred.values + residuals,
                "DataSet" = "test")

ggplot() + 
        # plot test samples
        geom_point(data = plot.data, 
            aes(x = obs.values, y = pred.values, color = DataSet)) +
        # plot residuals
        geom_segment(data = plot.residuals, alpha = 0.2,
            aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
        # plot optimal regressor
        geom_abline(color = "blue", slope = 1)

# plot the residuals
plot(density(residuals))

plot(pred.values, obs.values)
abline(0,1)

# testMSE (Bagging) — much lower than the single regression tree (~24.4)
mean((obs.values - pred.values)^2)
## [1] 14.26849
# ---------------------------- Bagging with 25 trees ---------------------------
# change number of trees in bagging or random forest using ntree parameter

# Bagging (mtry = number of predictors considered at each split; only 25 trees)
bag.boston.25 <- randomForest(medv ~ ., data = boston.df, subset = train.idx,  
                           mtry = ncol(boston.df) - 1, importance = TRUE, ntree = 25)


bag.boston.25
## 
## Call:
##  randomForest(formula = medv ~ ., data = boston.df, mtry = ncol(boston.df) -      1, importance = T, ntree = 25, subset = train.idx) 
##                Type of random forest: regression
##                      Number of trees: 25
## No. of variables tried at each split: 13
## 
##           Mean of squared residuals: 13.99809
##                     % Var explained: 84.11
# importance of each predictor
# % IncMSE : Increase in out of bag MSE When given predictor excluded from model 
# IncNodePurity : Decrease on training RSS (regression) or deviance (classification) 
#                 in each split

importance(bag.boston.25)
##             %IncMSE IncNodePurity
## crim     2.82644654     869.24283
## zn      -0.18898953      20.60689
## indus    2.05009318     125.61061
## chas     0.67556199      73.26221
## nox      5.76321724     564.08186
## rm      17.00079669   17959.80899
## age      3.85258734     245.62522
## dis      8.05348940    2158.97998
## rad     -0.05974703      78.91323
## tax      7.62804930     615.26243
## ptratio  5.06408048     408.50769
## b        2.62026016     234.92618
## lstat    7.95259625    6772.36098
# plot importance predictors
varImpPlot(bag.boston.25)

# do the prediction on test data 
pred.values <- predict(bag.boston.25, newdata = boston.df[test.idx,])
obs.values <- boston.df[test.idx,]$medv
# find the R^2 for test data (Bagging with 25 trees)
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.771
# now plot the observed vs predicted on test data
plot.data <- data.frame("pred.values" = pred.values, "obs.values" = obs.values, 
                                "DataSet" = "test")
residuals <- obs.values - pred.values

plot.residuals <- data.frame("x" = obs.values, "y" = pred.values, 
                "x1" = obs.values, "y2" = pred.values + residuals,
                "DataSet" = "test")

ggplot() + 
        # plot test samples
        geom_point(data = plot.data, 
            aes(x = obs.values, y = pred.values, color = DataSet)) +
        # plot residuals
        geom_segment(data = plot.residuals, alpha = 0.2,
            aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
        # plot optimal regressor
        geom_abline(color = "blue", slope = 1)

# plot the residuals
plot(density(residuals))

plot(pred.values, obs.values)
abline(0,1)

# testMSE (Bagging with 25 trees) — worse than the default 500-tree bagging
mean((obs.values - pred.values)^2)
## [1] 17.85693
# ------------------------ Random forest ---------------------------------
# By Default :
#   mtry = sqrt(Number of features) For classification
#   mtry = one third of number of features For Regression
# Here mtry = 6 (< 13 predictors), so trees are decorrelated — true random forest
rf.boston <- randomForest(medv ~ ., data = boston.df, subset = train.idx,  
                           mtry = 6, importance = TRUE)


rf.boston
## 
## Call:
##  randomForest(formula = medv ~ ., data = boston.df, mtry = 6,      importance = T, subset = train.idx) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 11.30472
##                     % Var explained: 87.17
# importance of each predictor
# % IncMSE : Increase in out of bag MSE When given predictor excluded from model 
# IncNodePurity : Decrease on training RSS (regression) or deviance (classification) 
#                 in each split
importance(rf.boston)
##           %IncMSE IncNodePurity
## crim    13.457600    1584.57747
## zn       3.783388     115.53530
## indus    9.057431    1598.85331
## chas     1.357703      66.68147
## nox     14.026089    1467.71364
## rm      35.616616   10690.72056
## age      9.136849     674.94388
## dis     17.149022    2240.78477
## rad      4.067559     153.45064
## tax     12.664201     805.21803
## ptratio 11.523643    1595.11319
## b        7.572004     403.50368
## lstat   29.831143    9378.69053
# plot importance predictors
# across all of the rees considered in the random forest rm and lstat are by far 
# most important features
varImpPlot(rf.boston)

# do the predoiction on test data 
pred.values <- predict(rf.boston, newdata = boston.df[test.idx,])
obs.values <- boston.df[test.idx,]$medv
# find the R^2 for test data (Random Forest with 6 predictors)
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.872
# Observed vs. predicted on the test set for the random forest.
plot.data <- data.frame(
  "pred.values" = pred.values,
  "obs.values" = obs.values,
  "DataSet" = "test"
)
residuals <- obs.values - pred.values

# One segment per test point: from (obs, pred) up to (obs, pred + residual).
plot.residuals <- data.frame(
  "x" = obs.values,
  "y" = pred.values,
  "x1" = obs.values,
  "y2" = pred.values + residuals,
  "DataSet" = "test"
)

ggplot() +
  # scatter of the test samples
  geom_point(data = plot.data,
             aes(x = obs.values, y = pred.values, color = DataSet)) +
  # residual segments
  geom_segment(data = plot.residuals, alpha = 0.2,
               aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
  # the perfect regressor, y = x
  geom_abline(color = "blue", slope = 1)

# density of the test residuals
plot(density(residuals))

# base-graphics scatter with the y = x reference line
plot(pred.values, obs.values)
abline(0, 1)

# test MSE (Random Forest with 6 predictors)
mean((obs.values - pred.values)^2)
## [1] 10.01629
library(tidyverse)
library(dataPreparation)
library(gbm)
## Loaded gbm 2.1.8
# Reload the Boston housing data; "?" entries are read as NA.
boston.df <- read.csv(
  "/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/BostonHousing.csv", 
  header = TRUE, stringsAsFactors = FALSE, na.strings = "?")

boston.df <- tibble(boston.df)

# see if there is any NA in any records (empty result means none)
na.idx <- which(is.na(boston.df))

# No NA anywhere , let's do usual clean up:

# remove constant variables
constant_cols <- whichAreConstant(boston.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(boston.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(boston.df))
## [1] "whichAreBijection: it took me 0.06s to identify 0 column(s) to drop."
## integer(0)
# FIX: the cleaned frame was assigned to `caravan.df` (copy-paste from the
# Caravan exercise), so the drop never reached `boston.df`. Also use the
# scalar, short-circuiting `&&` inside `if`.
boston.df <- if (!is.null(bijections_cols) && length(bijections_cols) > 0) boston.df[-bijections_cols] else boston.df

set.seed(1113)
no.of.train <- ceiling(0.7 * nrow(boston.df))
no.of.test <- nrow(boston.df) - no.of.train

train.idx <- sample(seq_len(nrow(boston.df)), no.of.train)
test.idx <- setdiff(seq_len(nrow(boston.df)), train.idx)

# 5000 trees each with depth of 4; shrinkage (lambda, Algorithm 8.2 page 322)
# is left at the package default (0.1 in gbm >= 2.1; the ISLR text assumed
# the older 0.001 default -- TODO confirm which was intended).
boost.boston <- gbm(medv ~ ., data = boston.df[train.idx, ],
                    distribution = "gaussian", n.trees = 5000, 
                    interaction.depth = 4) 

# rm and lstat are most important features
summary(boost.boston)
# partial dependence plots:
# marginal affect of selected feature on response after interating out other features
plot(boost.boston, i="rm")

# median house price is decreasing with lstat
plot(boost.boston, i="lstat")

# do the predoiction on test data 
pred.values <- predict(boost.boston, newdata = boston.df[test.idx,], n.trees = 5000)
obs.values <- boston.df[test.idx,]$medv
# find the R^2 for test data (Random Forest with 6 predictors)
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.822
# Observed vs. predicted on the test set for the boosted model.
plot.data <- data.frame(
  "pred.values" = pred.values,
  "obs.values" = obs.values,
  "DataSet" = "test"
)
residuals <- obs.values - pred.values

# One segment per test point: from (obs, pred) up to (obs, pred + residual).
plot.residuals <- data.frame(
  "x" = obs.values,
  "y" = pred.values,
  "x1" = obs.values,
  "y2" = pred.values + residuals,
  "DataSet" = "test"
)

ggplot() +
  # scatter of the test samples
  geom_point(data = plot.data,
             aes(x = obs.values, y = pred.values, color = DataSet)) +
  # residual segments
  geom_segment(data = plot.residuals, alpha = 0.2,
               aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
  # the perfect regressor, y = x
  geom_abline(color = "blue", slope = 1)

# density of the test residuals
plot(density(residuals))

# base-graphics scatter with the y = x reference line
plot(pred.values, obs.values)
abline(0, 1)

# test MSE (boosting, 5000 trees, default shrinkage)
mean((obs.values - pred.values)^2)
## [1] 13.88032
# ------------------------------------------------------------------ 
# boosting with specific shrinkage parameter lambda
# ------------------------------------------------------------------

# 5000 trees each with depth of 4; here the shrinkage parameter lambda
# (Algorithm 8.2 page 322) is explicitly set to 0.2 (it is NOT the default).
boost.boston <- gbm(medv ~ ., data = boston.df[train.idx, ],
                    distribution = "gaussian", n.trees = 5000, 
                    interaction.depth = 4, shrinkage = 0.2) 

# rm and lstat are most important features
summary(boost.boston)
# partial dependence plots:
# marginal effect of the selected feature on the response after integrating out
# the other features
plot(boost.boston, i="rm")

# median house price is decreasing with lstat
plot(boost.boston, i="lstat")

# do the prediction on test data
pred.values <- predict(boost.boston, newdata = boston.df[test.idx,], n.trees = 5000)
obs.values <- boston.df[test.idx,]$medv
# find the R^2 for test data (boosting, shrinkage = 0.2)
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.811
# Observed vs. predicted on the test set (boosting, shrinkage = 0.2).
plot.data <- data.frame(
  "pred.values" = pred.values,
  "obs.values" = obs.values,
  "DataSet" = "test"
)
residuals <- obs.values - pred.values

# One segment per test point: from (obs, pred) up to (obs, pred + residual).
plot.residuals <- data.frame(
  "x" = obs.values,
  "y" = pred.values,
  "x1" = obs.values,
  "y2" = pred.values + residuals,
  "DataSet" = "test"
)

ggplot() +
  # scatter of the test samples
  geom_point(data = plot.data,
             aes(x = obs.values, y = pred.values, color = DataSet)) +
  # residual segments
  geom_segment(data = plot.residuals, alpha = 0.2,
               aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
  # the perfect regressor, y = x
  geom_abline(color = "blue", slope = 1)

# density of the test residuals
plot(density(residuals))

# base-graphics scatter with the y = x reference line
plot(pred.values, obs.values)
abline(0, 1)

# test MSE (boosting, shrinkage = 0.2) -- a larger lambda hurts here
mean((obs.values - pred.values)^2)
## [1] 14.86174
library(tidyverse)
library(dataPreparation)
library(randomForest)
library(tree)
# Reload the Boston housing data; "?" entries are read as NA.
boston.df <- read.csv(
  "/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/BostonHousing.csv", 
  header = TRUE, stringsAsFactors = FALSE, na.strings = "?")

boston.df <- tibble(boston.df)

# see if there is any NA in any records (empty result means none)
na.idx <- which(is.na(boston.df))

# No NA anywhere , let's do usual clean up:

# remove constant variables
constant_cols <- whichAreConstant(boston.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(boston.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(boston.df))
## [1] "whichAreBijection: it took me 0.05s to identify 0 column(s) to drop."
## integer(0)
# Drop bijection columns if any were found; `&&` is the scalar,
# short-circuiting operator appropriate inside `if`.
boston.df <- if (!is.null(bijections_cols) && length(bijections_cols) > 0) boston.df[-bijections_cols] else boston.df

# Reproducible 70/30 train/test split.
set.seed(1113)
no.of.train <- ceiling(0.7 * nrow(boston.df))
no.of.test <- nrow(boston.df) - no.of.train

train.idx <- sample(seq_len(nrow(boston.df)), no.of.train)
test.idx <- setdiff(seq_len(nrow(boston.df)), train.idx)

# ------------------------ Random forest ---------------------------------
# By Default :
#   mtry = sqrt(Number of features) For classification
#   mtry = one third of number of features For Regression

# Search over forest sizes for the smallest test MSE.
# FIX: randomForest's argument is `ntree`, not `n.trees` (`n.trees` belongs
# to gbm). The misspelled argument was silently absorbed by `...`, so every
# iteration fit an identical default 500-tree forest and the search was a
# no-op. Note the curve is still noisy: each fit draws fresh bootstrap
# samples (no per-iteration seed).
plot.data.trees <- tibble("No.Of.Trees" = NULL, "test.MSE" = NULL, "R^2" = NULL)

for (no.of.trees in 10:100) {
  rf.boston <- randomForest(medv ~ ., data = boston.df, subset = train.idx,
                            ntree = no.of.trees, importance = FALSE)

  pred.values <- predict(rf.boston, newdata = boston.df[test.idx, ])
  obs.values <- boston.df[test.idx, ]$medv
  R.squared <- round((cor(pred.values, obs.values) ^ 2), 3)
  test.MSE <- mean((obs.values - pred.values)^2)

  plot.data.trees <- rbind(plot.data.trees, 
                           tibble("No.Of.Trees" = no.of.trees, 
                                  "test.MSE" = test.MSE, 
                                  "R^2" = R.squared))
}


ggplot() + 
        geom_line(data = plot.data.trees, 
            aes(x = No.Of.Trees, y = test.MSE)) + 
  labs(title="No of trees and test MSE")

# Number of trees that makes test.MSE minimum in the 10..100 range
plot.data.trees[which.min(plot.data.trees$test.MSE), ]
# --------------------------- mtry ------------------------------- #
# Search over the number of predictors tried at each split.
plot.data.mtry <- tibble("mtry" = NULL, "test.MSE" = NULL, "R^2" = NULL)

# The original `2:ncol(boston.df) - 1` relied on `:` binding tighter than
# `-`, i.e. it already meant 1:(ncol - 1); spell that intent out explicitly.
for (m.try in seq_len(ncol(boston.df) - 1)) {
  rf.boston <- randomForest(medv ~ ., data = boston.df, subset = train.idx,
                            mtry = m.try, importance = FALSE)

  pred.values <- predict(rf.boston, newdata = boston.df[test.idx, ])
  obs.values <- boston.df[test.idx, ]$medv
  R.squared <- round((cor(pred.values, obs.values) ^ 2), 3)
  test.MSE <- mean((obs.values - pred.values)^2)

  plot.data.mtry <- rbind(plot.data.mtry, 
                           tibble("mtry" = m.try, 
                                  "test.MSE" = test.MSE, 
                                  "R^2" = R.squared))

}

ggplot() + 
        geom_line(data = plot.data.mtry, 
            aes(x = mtry, y = test.MSE)) + 
  labs(title="mtry and test MSE")

# value of mtry that makes test.MSE minimum over the candidate values
plot.data.mtry[which.min(plot.data.mtry$test.MSE), ]
# over all the best result obtained for number of predictors = 4 and No of trees = 18:
# FIX: use `ntree` (randomForest's argument name); the original `n.trees`
# was silently ignored and the model was fit with the default 500 trees.
rf.boston <- randomForest(medv ~ ., data = boston.df, subset = train.idx,
                          mtry = 4, ntree = 18, importance = FALSE)

pred.values <- predict(rf.boston, newdata = boston.df[test.idx, ])
obs.values <- boston.df[test.idx, ]$medv
# R squared:
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.883
# testMSE:
(test.MSE <- mean((obs.values - pred.values)^2))
## [1] 9.542795
# now plot the observed vs predicted on test data
plot.data <- data.frame("pred.values" = pred.values, "obs.values" = obs.values, 
                                "DataSet" = "test")
residuals <- obs.values - pred.values

# segments from (obs, pred) up to (obs, pred + residual)
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values, 
                "x1" = obs.values, "y2" = pred.values + residuals,
                "DataSet" = "test")

ggplot() + 
        # plot test samples
        geom_point(data = plot.data, 
            aes(x = obs.values, y = pred.values, color = DataSet)) +
        # plot residuals
        geom_segment(data = plot.residuals, alpha = 0.2,
            aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
        # plot optimal regressor
        geom_abline(color = "blue", slope = 1)

library(tidyverse)
library(tree)
library(randomForest)
library(dataPreparation)
options(warn = 1)
set.seed(1)
# Load the Carseats data; "?" entries are read as NA.
carseats.df <- read.csv("/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/Carseats.csv", 
                        header = TRUE, stringsAsFactors = FALSE, na.strings = "?")

carseats.df <- tibble(carseats.df)


# find NAs
# NOTE(review): all_vars() keeps only rows where EVERY column is NA;
# "find NAs" suggests any_vars() may have been intended -- confirm.
carseats.df %>% 
   filter_all(all_vars(is.na(.))) 
# convert all character columns to factor, with levels in the order
# produced by table() (i.e. sorted unique values)
char.to.fctor <- function(df)
  df %>%
    mutate_if(is.character, ~ factor(.x, levels = (.x %>% table() %>% names())))

carseats.df <- char.to.fctor(carseats.df)

# remove constant variables
constant_cols <- whichAreConstant(carseats.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(carseats.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(carseats.df))
## [1] "whichAreBijection: it took me 0.03s to identify 0 column(s) to drop."
## integer(0)
# Drop bijection columns if any were found; `&&` is the scalar,
# short-circuiting operator appropriate inside `if`.
carseats.df <- if (!is.null(bijections_cols) && length(bijections_cols) > 0) carseats.df[-bijections_cols] else carseats.df

#----------------------------------------
# a) split the data into train and test
# Reproducible 70/30 split; kept byte-identical because the RNG stream
# (seed + sample call) determines every downstream result.
set.seed(1113)
no.of.train <- ceiling(0.7 * nrow(carseats.df))
no.of.test <- nrow(carseats.df) - no.of.train

train.idx <- sample(seq_len(nrow(carseats.df)), no.of.train)
test.idx <- setdiff(seq_len(nrow(carseats.df)), train.idx)

# b) Fit a regression tree , plot the tree and interpret the results, testMSE
tree.carseats <- tree(Sales ~ ., carseats.df, subset = train.idx)

plot(tree.carseats)
text(tree.carseats, pretty = 0)

# summary
summary(tree.carseats)
## 
## Regression tree:
## tree(formula = Sales ~ ., data = carseats.df, subset = train.idx)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Age"         "Population"  "CompPrice"  
## [6] "Advertising" "Education"   "US"         
## Number of terminal nodes:  20 
## Residual mean deviance:  1.895 = 492.7 / 260 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.55300 -0.89080 -0.05948  0.00000  0.90790  3.93100
# the tree (printed node-by-node)
tree.carseats
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##   1) root 280 2005.000  7.341  
##     2) ShelveLoc: Bad,Medium 224 1322.000  6.769  
##       4) Price < 94.5 41  122.500  9.141  
##         8) Age < 34.5 7   20.350 10.730 *
##         9) Age > 34.5 34   80.910  8.815  
##          18) Population < 56.5 6   14.780 10.550 *
##          19) Population > 56.5 28   44.200  8.443 *
##       5) Price > 94.5 183  916.800  6.237  
##        10) ShelveLoc: Bad 51  191.400  4.717  
##          20) Population < 180.5 19   59.460  3.535  
##            40) Price < 120.5 11   20.570  4.463 *
##            41) Price > 120.5 8   16.390  2.259 *
##          21) Population > 180.5 32   89.630  5.419  
##            42) Age < 63.5 21   59.040  6.035  
##              84) Price < 130 12   22.060  6.998 *
##              85) Price > 130 9   11.010  4.751 *
##            43) Age > 63.5 11    7.378  4.242 *
##        11) ShelveLoc: Medium 132  562.000  6.824  
##          22) Age < 47.5 45  133.000  8.001  
##            44) Price < 142.5 36   78.760  8.527 *
##            45) Price > 142.5 9    4.319  5.894 *
##          23) Age > 47.5 87  334.500  6.216  
##            46) Price < 137.5 74  202.300  6.562  
##              92) CompPrice < 123.5 33   36.730  5.587 *
##              93) CompPrice > 123.5 41  109.000  7.346  
##               186) Price < 115.5 14   15.890  8.686 *
##               187) Price > 115.5 27   54.940  6.652 *
##            47) Price > 137.5 13   72.920  4.245  
##              94) Advertising < 2.5 7   22.520  2.747 *
##              95) Advertising > 2.5 6   16.360  5.993 *
##     3) ShelveLoc: Good 56  315.700  9.632  
##       6) Price < 135 42  155.400 10.420  
##        12) Education < 12.5 17   41.740 11.700 *
##        13) Education > 12.5 25   66.690  9.545  
##          26) US: No 8    3.523  7.818 *
##          27) US: Yes 17   28.050 10.360 *
##       7) Price > 135 14   56.770  7.276  
##        14) Age < 62.5 9   26.320  8.244 *
##        15) Age > 62.5 5    6.844  5.534 *
# let's draw the residuals pdf, very close to standard normal
plot(density(summary(tree.carseats)$residuals))

# deviance is sum of squared error for the tree

# do the prediction on test data
pred.values <- predict(tree.carseats, newdata = carseats.df[test.idx,])
obs.values <- carseats.df[test.idx,]$Sales

# R squared:
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.387
# testMSE:
(test.MSE <- mean((obs.values - pred.values)^2))
## [1] 6.266808
# Observed vs. predicted on the test set for the unpruned tree.
plot.data <- data.frame(
  "Pred.values" = pred.values,
  "obs.values" = obs.values,
  "DataSet" = "test"
)
residuals <- obs.values - pred.values

# One segment per test point: from (obs, pred) up to (obs, pred + residual).
plot.residuals <- data.frame(
  "x" = obs.values,
  "y" = pred.values,
  "x1" = obs.values,
  "y2" = pred.values + residuals,
  "DataSet" = "test"
)

ggplot() +
  # scatter of the test samples
  geom_point(data = plot.data,
             aes(x = obs.values, y = pred.values, color = DataSet)) +
  # residual segments
  geom_segment(data = plot.residuals, alpha = 0.2,
               aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
  # the perfect regressor, y = x
  geom_abline(color = "blue", slope = 1)

# ----------------------------------
# c) Let's use CV to prune the tree (10-fold)
cv.carseats <- cv.tree(object = tree.carseats, FUN = prune.tree, K = 10)

# Number of terminal nodes of each tree that CV considered
cv.carseats$size
##  [1] 20 18 17 16 14 13 12 11 10  9  8  7  6  5  4  3  2  1
# error rate corresponding to each tree that CV considered
cv.carseats$dev
##  [1] 1301.257 1301.464 1296.063 1285.945 1280.598 1298.562 1304.273 1294.526
##  [9] 1302.293 1374.875 1395.500 1431.839 1424.860 1435.591 1445.018 1592.663
## [17] 1899.638 2030.819
# value of cost complexity parameter alpha that corresponds to each tree considered by CV
cv.carseats$k
##  [1]      -Inf  21.57798  22.49797  23.61378  24.59477  34.04505  35.11703
##  [8]  38.13740  42.31567  46.93310  49.90694  56.59542  59.32975  94.50419
## [15] 103.55720 163.36062 282.58758 367.28168
# create a tibble to have these values in one place
# FIX: column name typo "aplpha" -> "alpha" (the column is not referenced
# by name anywhere downstream, so the rename is safe).
(cv.result <- tibble(error.rate = cv.carseats$dev, tree.size = cv.carseats$size, 
                    alpha = cv.carseats$k)
)
cv.result[which.min(cv.result$error.rate), ]
# plot error rate against size and k 
plot(cv.carseats$size, cv.carseats$dev, type = "b")

plot(cv.carseats$k, cv.carseats$dev, type = "b")

# seems like tree with 14 terminals results in minimum cv error rate
prune.carseats <- prune.tree(tree.carseats, best = 14)

#original data
str(carseats.df)
## Classes 'data.table' and 'data.frame':   400 obs. of  11 variables:
##  $ Sales      : num  9.5 11.22 10.06 7.4 4.15 ...
##  $ CompPrice  : int  138 111 113 117 141 124 115 136 132 132 ...
##  $ Income     : int  73 48 35 100 64 113 105 81 110 113 ...
##  $ Advertising: int  11 16 10 4 3 13 0 15 0 0 ...
##  $ Population : int  276 260 269 466 340 501 45 425 108 131 ...
##  $ Price      : int  120 83 80 97 128 72 108 120 124 124 ...
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Age        : int  42 65 59 55 38 78 71 67 76 76 ...
##  $ Education  : int  17 10 12 14 13 16 15 10 10 17 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
#pruned tree (14 terminal nodes chosen by CV above)
summary(prune.carseats)
## 
## Regression tree:
## snip.tree(tree = tree.carseats, nodes = c(4L, 20L, 7L, 21L))
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Population"  "Age"         "CompPrice"  
## [6] "Advertising" "Education"   "US"         
## Number of terminal nodes:  14 
## Residual mean deviance:  2.373 = 631.2 / 266 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.6960 -1.0080 -0.1319  0.0000  1.0450  4.3440
plot(prune.carseats)
text(prune.carseats, pretty = 0)

# do the prediction on test data
pred.values <- predict(prune.carseats, newdata = carseats.df[test.idx,])
obs.values <- carseats.df[test.idx,]$Sales

# R squared (pruned tree):
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.391
# testMSE (pruned tree) :
(test.MSE <- mean((obs.values - pred.values)^2))
## [1] 6.160287
# now plot the observed vs predicted on test data
plot.data <- data.frame("Pred.values" = pred.values, "obs.values" = obs.values, 
                                "DataSet" = "test")
residuals <- obs.values - pred.values

# segments from (obs, pred) up to (obs, pred + residual)
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values, 
                "x1" = obs.values, "y2" = pred.values + residuals,
                "DataSet" = "test")

ggplot() + 
        # plot test samples
        geom_point(data = plot.data, 
            aes(x = obs.values, y = pred.values, color = DataSet)) +
        # plot residuals
        geom_segment(data = plot.residuals, alpha = 0.2,
            aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
        # plot optimal regressor
        geom_abline(color = "blue", slope = 1)

# pruning shows slight improvement on R squared and Test.MSE

# ---------------------------------------------
# d) Use bagging approach

# Bagging = random forest with mtry equal to the full number of predictors.
bag.carseats <- randomForest(Sales ~ ., data = carseats.df,subset = train.idx,  
                           mtry=ncol(carseats.df)-1, importance=T)

# importance of each predictor
# % IncMSE : Increase in out of bag MSE When given predictor excluded from model 
# IncNodePurity : Decrease on training RSS (regression) or deviance (classification) 
#                 in each split
importance(bag.carseats)
##                %IncMSE IncNodePurity
## CompPrice   28.5214943    196.214712
## Income       5.9836079     85.997845
## Advertising 14.8622966    100.626687
## Population   4.2421305     91.249508
## Price       68.1821119    671.421375
## ShelveLoc   61.0398841    503.742273
## Age         25.4366228    232.470799
## Education    0.4686614     55.718900
## Urban       -1.0075799      8.037247
## US           8.3649741     19.585203
# plot importance predictors
varImpPlot(bag.carseats)

# do the prediction on test data
pred.values <- predict(bag.carseats, newdata = carseats.df[test.idx,])
obs.values <- carseats.df[test.idx,]$Sales

# R squared (bagging):
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.677
# testMSE (bagging) :
(test.MSE <- mean((obs.values - pred.values)^2))
## [1] 3.704952
# now plot the observed vs predicted on test data
plot.data <- data.frame("pred.values" = pred.values, "obs.values" = obs.values, 
                                "DataSet" = "test")
residuals <- obs.values - pred.values

# segments from (obs, pred) up to (obs, pred + residual)
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values, 
                "x1" = obs.values, "y2" = pred.values + residuals,
                "DataSet" = "test")

ggplot() + 
        # plot test samples
        geom_point(data = plot.data, 
            aes(x = obs.values, y = pred.values, color = DataSet)) +
        # plot residuals
        geom_segment(data = plot.residuals, alpha = 0.2,
            aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
        # plot optimal regressor
        geom_abline(color = "blue", slope = 1)

# plot the residuals
plot(density(residuals))

# ---------------------------------------------
# e) Use Random Forest


# ------------------------ Random forest ---------------------------------
# By Default :
#   mtry = sqrt(Number of features) For classification
#   mtry = one third of number of features For Regression

# first let's find optimal number of trees
# FIX: randomForest's argument is `ntree`, not `n.trees` (that spelling
# belongs to gbm). The typo was silently absorbed by `...`, so every
# iteration fit an identical default 500-tree forest.
plot.data.trees <- tibble("No.Of.Trees" = NULL, "test.MSE" = NULL, "R^2" = NULL)

for (no.of.trees in 10:100) {
  rf.carseats <- randomForest(Sales ~ ., data = carseats.df, subset = train.idx,
                              ntree = no.of.trees, importance = FALSE)

  pred.values <- predict(rf.carseats, newdata = carseats.df[test.idx, ])
  obs.values <- carseats.df[test.idx, ]$Sales
  R.squared <- round((cor(pred.values, obs.values) ^ 2), 3)
  test.MSE <- mean((obs.values - pred.values)^2)

  plot.data.trees <- rbind(plot.data.trees, 
                           tibble("No.Of.Trees" = no.of.trees, 
                                  "test.MSE" = test.MSE, 
                                  "R^2" = R.squared))
}


ggplot() + 
        geom_line(data = plot.data.trees, 
            aes(x = No.Of.Trees, y = test.MSE)) + 
  labs(title="No of trees and test MSE")

# Number of trees that makes test.MSE minimum in the 10..100 range
plot.data.trees[which.min(plot.data.trees$test.MSE), ]
# Let's find number of predictors to build the tree 
# --------------------------- mtry ------------------------------- #
plot.data.mtry <- tibble("mtry" = NULL, "test.MSE" = NULL, "R^2" = NULL)

# The original `2:ncol(carseats.df) - 1` relied on `:` binding tighter than
# `-`, i.e. it already meant 1:(ncol - 1); spell that intent out explicitly.
for (m.try in seq_len(ncol(carseats.df) - 1)) {
  rf.carseats <- randomForest(Sales ~ ., data = carseats.df, subset = train.idx,
                              mtry = m.try, importance = FALSE)

  pred.values <- predict(rf.carseats, newdata = carseats.df[test.idx, ])
  obs.values <- carseats.df[test.idx, ]$Sales
  R.squared <- round((cor(pred.values, obs.values) ^ 2), 3)
  test.MSE <- mean((obs.values - pred.values)^2)

  plot.data.mtry <- rbind(plot.data.mtry, 
                           tibble("mtry" = m.try, 
                                  "test.MSE" = test.MSE, 
                                  "R^2" = R.squared))

}

ggplot() + 
        geom_line(data = plot.data.mtry, 
            aes(x = mtry, y = test.MSE)) + 
  labs(title="mtry and test MSE")

# value of mtry that makes test.MSE minimum over the candidate values
plot.data.mtry[which.min(plot.data.mtry$test.MSE), ]
# over all the best result obtained for number of predictors = 8 and No of trees = 33:
# FIX: use `ntree` (randomForest's argument name); the original `n.trees`
# was silently ignored and the model was fit with the default 500 trees.
rf.carseats <- randomForest(Sales ~ ., data = carseats.df, subset = train.idx,
                            mtry = 8, ntree = 33, importance = FALSE)

pred.values <- predict(rf.carseats, newdata = carseats.df[test.idx, ])
obs.values <- carseats.df[test.idx, ]$Sales
# R squared:
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.687
# testMSE:
(test.MSE <- mean((obs.values - pred.values)^2))
## [1] 3.677755
# now plot the observed vs predicted on test data
plot.data <- data.frame("pred.values" = pred.values, "obs.values" = obs.values, 
                                "DataSet" = "test")
residuals <- obs.values - pred.values

# segments from (obs, pred) up to (obs, pred + residual)
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values, 
                "x1" = obs.values, "y2" = pred.values + residuals,
                "DataSet" = "test")

ggplot() + 
        # plot test samples
        geom_point(data = plot.data, 
            aes(x = obs.values, y = pred.values, color = DataSet)) +
        # plot residuals
        geom_segment(data = plot.residuals, alpha = 0.2,
            aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
        # plot optimal regressor
        geom_abline(color = "blue", slope = 1)

# importance: how much training RSS is decreased (node purity increased) on each split
# (only IncNodePurity is shown because the model was fit with importance=F)
importance(rf.carseats)
##             IncNodePurity
## CompPrice      192.082314
## Income          92.591706
## Advertising    115.678146
## Population      94.699812
## Price          646.896851
## ShelveLoc      491.485512
## Age            228.380907
## Education       61.620468
## Urban            7.736238
## US              19.059774
varImpPlot(rf.carseats)

library(tidyverse)
library(tree)
library(randomForest)
library(dataPreparation)
options(warn = 1)
# Load the orange-juice data; "?" entries are read as NA.
oj.df <- read.csv("/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/orange_juice_withmissing.csv", 
                  header = TRUE, stringsAsFactors = FALSE, na.strings = "?")

oj.df <- tibble(oj.df)

#----------------------------------------
# a) split the data into train and test
# lets first split data to 70% train and the rest test 
set.seed(1113)
# NOTE: 0.7 * nrow() may be fractional; sample() truncates the size to an integer.
train.idx <- sample(seq_len(nrow(oj.df)), 0.7 * nrow(oj.df))
test.idx <- setdiff(seq_len(nrow(oj.df)), train.idx)

train.df <- oj.df[train.idx, ]
test.df <- oj.df[test.idx, ]

# remove empty characters and NA helper:
# keeps rows where every character column is non-blank and not the literal
# string "NA" after trimming. (select_all with no rename function is a no-op.)
remove.NA.chars <- function(df)
  df %>%
  select_all %>%
  filter_if(is.character, all_vars(trimws(.) != "" & trimws(.) != "NA"))
  
# Clean up train data set: drop NA rows, then the blank/"NA" string rows
train.df <- 
  train.df %>%
  na.omit() %>%
  remove.NA.chars()

# remove constant variables
(constant_cols <- whichAreConstant(train.df))
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
## integer(0)
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(train.df))
## [1] "whichAreInDouble: it took me 0.02s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(train.df))
## [1] "whichAreBijection: it took me 0.11s to identify 0 column(s) to drop."
## integer(0)
# NOTE(review): `&&` would be the idiomatic scalar operator here.
train.df <- if (!is.null(bijections_cols) & length(bijections_cols) > 0 ) train.df[- bijections_cols] else train.df

# Convert the character-typed numeric columns to numeric in one pass.
# (read.csv left them as character because of "NA"/"" entries; the
# as.character() round-trip matches the original per-column conversions.)
# mutate_at matches the dplyr 0.8.x scoped-verb style already used here.
numeric.cols <- c("PriceCH", "PriceMM", "DiscCH", "DiscMM",
                  "SpecialCH", "SpecialMM", "LoyalCH",
                  "SalePriceCH", "SalePriceMM", "PriceDiff",
                  "PctDiscMM", "PctDiscCH", "StoreID", "STORE")
train.df <- train.df %>%
  mutate_at(vars(one_of(numeric.cols)), ~ as.numeric(as.character(.x)))

# remove constant variables
constant_cols <- whichAreConstant(train.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(train.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
# After the numeric conversion, STORE turns out to be a bijection of StoreID
# and gets dropped (column index 18).
(bijections_cols <- whichAreBijection(train.df))
## [1] "whichAreBijection: STORE is a bijection of StoreID. I put it in drop list."
## [1] "whichAreBijection: it took me 0.06s to identify 1 column(s) to drop."
## [1] 18
train.df <- if (!is.null(bijections_cols) & length(bijections_cols) > 0 ) train.df[- bijections_cols] else train.df

# It looks Purchase has only two values "CH" and "MM"
table(train.df$Purchase)
## 
##  CH  MM 
## 444 278
# convert all character columns to factor, with levels in the order
# produced by table() (i.e. sorted unique values)
char.to.fctor <- function(df)
  df %>%
    mutate_if(is.character, ~ factor(.x, levels = (.x %>% table() %>% names())))


train.df <- char.to.fctor(train.df)
# Purchase is encoded by factor as CH -> 1, MM -> 2

head(train.df$Purchase)
## [1] MM CH CH CH MM MM
## Levels: CH MM
as.integer(head(train.df$Purchase))
## [1] 2 1 1 1 2 2
# --------- do similar clean up for test data ----------------

# remove empty characters and NA helper
# (NOTE(review): identical redefinition of the helper declared earlier for
# the training data; redundant but harmless.)
remove.NA.chars <- function(df)
  df %>%
  select_all %>%
  filter_if(is.character, all_vars(trimws(.) != "" & trimws(.) != "NA"))
  
# Finally convert character columns to factor

# Clean up test data set: drop NA rows, then the blank/"NA" string rows
test.df <- 
  test.df %>%
  na.omit() %>%
  remove.NA.chars()


# remove constant variables
(constant_cols <- whichAreConstant(test.df))
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
## integer(0)
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(test.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(test.df))
## [1] "whichAreBijection: it took me 0.09s to identify 0 column(s) to drop."
## integer(0)
# NOTE(review): `&&` would be the idiomatic scalar operator here.
test.df <- if (!is.null(bijections_cols) & length(bijections_cols) > 0 ) test.df[- bijections_cols] else test.df

# Convert characters to numeric
# Coerce each of these columns from character to numeric, round-tripping
# through as.character() so a factor column would also convert safely.
oj.numeric.cols <- c("PriceCH", "PriceMM", "DiscCH", "DiscMM",
                     "SpecialCH", "SpecialMM", "LoyalCH",
                     "SalePriceCH", "SalePriceMM", "PriceDiff",
                     "PctDiscMM", "PctDiscCH", "StoreID", "STORE")
for (col in oj.numeric.cols) {
  test.df[[col]] <- as.numeric(as.character(test.df[[col]]))
}

# Second pass of the cleanup checks: STORE/StoreID only became numeric in
# the step above, so the bijection between them is detectable only now.
# remove constant variables
constant_cols <- whichAreConstant(test.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(test.df))
## [1] "whichAreInDouble: it took me 0.01s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(test.df))
## [1] "whichAreBijection: STORE is a bijection of StoreID. I put it in drop list."
## [1] "whichAreBijection: it took me 0.09s to identify 1 column(s) to drop."
## [1] 18
test.df <- if (!is.null(bijections_cols) & length(bijections_cols) > 0 ) test.df[- bijections_cols] else test.df

# convert all character columns to factor
# (redefines the identical helper used for the training data)
char.to.fctor <- function(df)
  df %>%
    mutate_if(is.character, ~ factor(.x, levels = (.x %>% table() %>% names())))

test.df <- char.to.fctor(test.df)
# Purchase is encoded by factor as CH -> 1, MM -> 2

head(test.df$Purchase)
## [1] CH MM CH CH CH CH
## Levels: CH MM
as.integer(head(test.df$Purchase))
## [1] 1 2 1 1 1 1
# --------------------------------------------------------------------------------
# b) Fit a classification tree, plot the tree and interpret the results, testMSE
# Note: since "Purchase" is a factor, tree() builds a classification tree
tree.oj <- tree(Purchase ~ ., train.df)

# Number of terminal nodes:  7 
# Residual mean deviance:  0.7995
# Misclassification error rate: 0.177
summary(tree.oj)
## 
## Classification tree:
## tree(formula = Purchase ~ ., data = train.df)
## Variables actually used in tree construction:
## [1] "LoyalCH"       "PriceDiff"     "ListPriceDiff"
## Number of terminal nodes:  8 
## Residual mean deviance:  0.7261 = 518.4 / 714 
## Misclassification error rate: 0.162 = 117 / 722
# ----------------------------------
# c) the tree (printed node-by-node; * marks terminal nodes)
tree.oj
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 722 962.40 CH ( 0.61496 0.38504 )  
##    2) LoyalCH < 0.50395 312 371.20 MM ( 0.28205 0.71795 )  
##      4) LoyalCH < 0.276142 146 100.90 MM ( 0.10959 0.89041 )  
##        8) LoyalCH < 0.035047 52   0.00 MM ( 0.00000 1.00000 ) *
##        9) LoyalCH > 0.035047 94  85.77 MM ( 0.17021 0.82979 ) *
##      5) LoyalCH > 0.276142 166 227.20 MM ( 0.43373 0.56627 )  
##       10) PriceDiff < 0.05 67  73.66 MM ( 0.23881 0.76119 ) *
##       11) PriceDiff > 0.05 99 135.50 CH ( 0.56566 0.43434 ) *
##    3) LoyalCH > 0.50395 410 319.50 CH ( 0.86829 0.13171 )  
##      6) PriceDiff < -0.39 26  30.29 MM ( 0.26923 0.73077 ) *
##      7) PriceDiff > -0.39 384 234.40 CH ( 0.90885 0.09115 )  
##       14) LoyalCH < 0.703993 117 118.70 CH ( 0.79487 0.20513 )  
##         28) ListPriceDiff < 0.235 40  54.55 CH ( 0.57500 0.42500 ) *
##         29) ListPriceDiff > 0.235 77  46.91 CH ( 0.90909 0.09091 ) *
##       15) LoyalCH > 0.703993 267  91.71 CH ( 0.95880 0.04120 ) *
# feature split criterion: LoyalCH < 0.0356415
# Number of training observations in the branch: 44
# deviance: 0.00
# prediction for the branch: MM
# Fractions of observation classifications for the branch ( 0.00000 1.00000 )

#d) plot the tree
# It looks like the decision is based mostly on different ranges of LoyalCH
# and PriceDiff; few other predictors matter in the decision making
plot(tree.oj)
text(tree.oj, pretty = 0)

# --------------------------
# e) prediction on test data 
# --------------------------


# pred.probs generates below data (one row of class probabilities per
# test observation):
#            CH         MM
# 1   0.5750000 0.42500000
# 2   0.2388060 0.76119403
# 3   0.9588015 0.04119850

pred.probs <- predict(tree.oj, newdata = test.df)

# classify each observation as the class with the larger probability
pred.values <- ifelse(pred.probs[, 1] > pred.probs[, 2], "CH", "MM")

obs.values <- test.df$Purchase

(confusion_table <- table(pred.values, obs.values))
##            obs.values
## pred.values  CH  MM
##          CH 171  34
##          MM  20  83
# Here CH plays the role of the "negative" (0) class and MM the "positive"
# (1) class, following the table row/column order.
# TN: observed as 0 and predicted as 0 
true.negative <- confusion_table[1,1]

# TP: observed as 1 and predicted as 1 
true.positive <- confusion_table[2,2]

# FP: observed as 0 and predicted as 1 
false.positive <- confusion_table[2,1]

# FN: observed as 1 and predicted as 0 
false.negative <- confusion_table[1,2]

observations.total <- true.negative + false.negative + true.positive + false.positive
observed.as.0 <- true.negative + false.positive
observed.as.1 <- false.negative + true.positive

# Random guessing rate (accuracy of always predicting the majority class)
(nullClassifier <- max(observed.as.0 / observations.total, observed.as.1 / observations.total))
## [1] 0.6201299
# classification Rate:
(classification.rate <- mean(pred.values == obs.values))
## [1] 0.8246753
# Test error Rate:
(missclassificationRate <- 1 - classification.rate)
## [1] 0.1753247
# False Positive Rate (or: Type I error, 1 - specificity):
(FP_rates <- false.positive / observed.as.0)
## [1] 0.104712
# True Positive Rate (or: 1 - Type II error, power, sensitivity, recall):
(TP_rates <- true.positive / observed.as.1)
## [1] 0.7094017
# Precision (1 - false discovery proportion):
(precisions <- true.positive / (true.positive + false.positive))
## [1] 0.8058252
# Specificity
(specificities <-  1 - false.positive/(false.positive + true.negative))
## [1] 0.895288
library(pROC)
# BUG FIX: a ROC curve needs a score that is monotone in the probability of
# ONE class. The previous code fed roc() the per-row *maximum* of the two
# class probabilities, which is not a valid ranking score and distorts the
# curve. Use the probability of the "MM" class (the case level) instead.
pred.probs.roc <- pred.probs[, "MM"]
roc_obj <- roc(unlist(obs.values), pred.probs.roc)
# Let's draw some AUC plots
plot(roc_obj, legacy.axes = TRUE)

# ------------------------------------
# f) apply cv.tree to prune the tree (10-fold CV, misclassification cost)
# ------------------------------------
cv.oj <- cv.tree(object = tree.oj, FUN = prune.misclass, K = 10)

# Number of terminal nodes of each tree that CV considered
cv.oj$size
## [1] 8 5 3 2 1
# Error rate corresponding to each tree that CV considered
cv.oj$dev
## [1] 135 135 134 144 278
# value of cost complexity parameter alpha that corresponds to each tree considered by CV
cv.oj$k
## [1]  -Inf   0.0   6.5  12.0 136.0
# -------------------------------------------------
# g) plot tree size vs CV classification error rate
# --------------------------------------------------
# plot the misclassification error rate as a function of size and k 
plot(cv.oj$size, cv.oj$dev, type = 'b')

plot(cv.oj$k, cv.oj$dev, type = 'b')

# ---------------------------------------------------------------------------------------
# h) seems like 3 is the best size of the tree (lowest CV deviance above)
# ---------------------------------------------------------------------------------------

#--------------------------------------
# i)  let's prune it to get to tree with size 3
# ---------------------------------------------
prune.oj <- prune.misclass(tree.oj, best = 3)



plot(prune.oj)
text(prune.oj, pretty = 0)

# let's run it on the test set to see the misclassification rate again:

#---------------------------------------------------------------------
# j) compare training error rates between pruned and unpruned tree
# --------------------------------------------------------------------

#pruned tree
summary(prune.oj)
## 
## Classification tree:
## snip.tree(tree = tree.oj, nodes = c(7L, 2L))
## Variables actually used in tree construction:
## [1] "LoyalCH"   "PriceDiff"
## Number of terminal nodes:  3 
## Residual mean deviance:  0.8844 = 635.9 / 719 
## Misclassification error rate: 0.1801 = 130 / 722
#---------------------------------------------------------------------
# j) compare test error rates between pruned and unpruned tree
# --------------------------------------------------------------------

# pred.probs generates below data (class probabilities from the PRUNED tree):
#            CH         MM
# 1   0.5750000 0.42500000
# 2   0.2388060 0.76119403
# 3   0.9588015 0.04119850

pred.probs <- predict(prune.oj, newdata = test.df)

# classify each observation as the class with the larger probability
pred.values <- ifelse(pred.probs[, 1] > pred.probs[, 2], "CH", "MM")

obs.values <- test.df$Purchase

pred.probs <- predict(tree.oj, newdata = test.df)

# Confusion matrix for the pruned tree (CH = "negative", MM = "positive")
(confusion_table <- table(pred.values, obs.values))
##            obs.values
## pred.values  CH  MM
##          CH 145  19
##          MM  46  98
# TN: observed as 0 and predicted as 0 
true.negative <- confusion_table[1,1]

# TP: observed as 1 and predicted as 1 
true.positive <- confusion_table[2,2]

# FP: observed as 0 and predicted as 1 
false.positive <- confusion_table[2,1]

# FN: observed as 1 and predicted as 0 
false.negative <- confusion_table[1,2]

observations.total <- true.negative + false.negative + true.positive + false.positive
observed.as.0 <- true.negative + false.positive
observed.as.1 <- false.negative + true.positive

# Random guessing rate (always predict the majority class)
(nullClassifier <- max(observed.as.0 / observations.total, observed.as.1 / observations.total))
## [1] 0.6201299
# classification Rate:
(classification.rate <- mean(pred.values == obs.values))
## [1] 0.788961
# Test error Rate:
(missclassificationRate <- 1 - classification.rate)
## [1] 0.211039
# False Positive Rate (or: Type I error, 1 - specificity):
(FP_rates <- false.positive / observed.as.0)
## [1] 0.2408377
# True Positive Rate (or: 1 - Type II error, power, sensitivity, recall):
(TP_rates <- true.positive / observed.as.1)
## [1] 0.8376068
# Precision (1 - false discovery proportion):
(precisions <- true.positive / (true.positive + false.positive))
## [1] 0.6805556
# Specificity
(specificities <-  1 - false.positive/(false.positive + true.negative))
## [1] 0.7591623
library(pROC)
# BUG FIX: as with the unpruned tree, the ROC score must be the probability
# of a single class, not the per-row maximum of the two class probabilities
# (which is not a valid ranking score). Use the "MM" (case) probability.
pred.probs.roc <- pred.probs[, "MM"]

roc_obj <- roc(unlist(obs.values), pred.probs.roc)
# Let's draw some AUC plots
plot(roc_obj, legacy.axes = TRUE)

#  pruned tree classifier behaves worse than the full tree:
library(tidyverse)
library(gbm)
library(glmnet) # for Lasso
## Loaded glmnet 3.0-2
library(randomForest)
library(dataPreparation)
options(warn = 1)

set.seed(1311)

hitters.df = read.csv(
  "/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/Hitters.csv", 
  header=T, stringsAsFactors = F, na.strings = "?")

# -------------------------------
# 

# note that Salary is of type string and some values are the literal text
# "NA" (read.csv only converts "?" to real NA here, so "NA" survives as a
# string and can be counted with ==)
sum(hitters.df$Salary=="NA")
## [1] 59
hitters.df <- tibble(hitters.df)

# b) split the data: 70% of the rows form the training set, the rest the
#    test set
n <- nrow(hitters.df)
no.of.train <- ceiling(0.7 * n)
no.of.test <- n - no.of.train

# sample() draws the training rows; the test rows are whatever remains
train.idx <- sample(seq_len(n), no.of.train)
test.idx <- seq_len(n)[-train.idx]

train.df <- hitters.df[train.idx, ]
test.df <- hitters.df[test.idx, ]

# a) Cleaning and log transform

#So let's clean train data set

# Indices of rows containing a real NA or the literal string "NA":
# pmap(~c(...)) turns each row into a plain vector so the whole row can be
# scanned at once, then map() flags rows with at least one bad value.
na.train.idx <- 
  train.df %>%
  pmap(~c(...)) %>%
  map(~sum(is.na(.) | str_trim(.) == "NA") > 0) %>%
  unlist %>%
  which()

rest.train.idx <- setdiff(seq_len(no.of.train) , na.train.idx)

train.df <- train.df[rest.train.idx, ]

# remove constant variables
constant_cols <- whichAreConstant(train.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(train.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(train.df))
## [1] "whichAreBijection: League is a bijection of NewLeague. I put it in drop list."
## [1] "whichAreBijection: it took me 0.1s to identify 1 column(s) to drop."
## [1] 14
train.df <- if (!is.null(bijections_cols) & length(bijections_cols) > 0 ) train.df[- bijections_cols] else train.df

# log-transform the response (Salary arrives as character)
train.df$Salary <- log(as.numeric(as.character(train.df$Salary)))

# convert all character columns to factor
char.to.fctor <- function(df)
  df %>%
    mutate_if(is.character, ~ factor(.x, levels = (.x %>% table() %>% names())))

train.df <- char.to.fctor(train.df)

# Now clean test data (same row-scan as for the training data)

na.test.idx <- 
  test.df %>%
  pmap(~c(...)) %>%
  map(~sum(is.na(.) | str_trim(.) == "NA") > 0) %>%
  unlist %>%
  which()

rest.test.idx <- setdiff(seq_len(no.of.test) , na.test.idx)

test.df <- test.df[rest.test.idx, ]

# remove constant variables
constant_cols <- whichAreConstant(test.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(test.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(test.df))
## [1] "whichAreBijection: League is a bijection of NewLeague. I put it in drop list."
## [1] "whichAreBijection: it took me 0.12s to identify 1 column(s) to drop."
## [1] 14
test.df <- if (!is.null(bijections_cols) & length(bijections_cols) > 0 ) test.df[- bijections_cols] else test.df


# log-transform the test response as well
test.df$Salary <- log(as.numeric(as.character(test.df$Salary)))

# convert all character columns to factor
char.to.fctor <- function(df)
  df %>%
    mutate_if(is.character, ~ factor(.x, levels = (.x %>% table() %>% names())))

test.df <- char.to.fctor(test.df)


# c) perform regression boosting on the training set with 1000 trees for a
#    range of shrinkage parameters

boost.n.trees <- 1000
lambda.grid <- seq(0.1, 1, len = 100)

# Collect one tibble per lambda and bind once at the end: growing a data
# frame with rbind() inside a loop copies the whole accumulator on every
# iteration (accidentally O(n^2)).
train.rows <- vector("list", length(lambda.grid))
test.rows <- vector("list", length(lambda.grid))

for (i in seq_along(lambda.grid)) {
  lambda <- lambda.grid[i]
  hitters.boost <- gbm(Salary ~ ., data = train.df, distribution = "gaussian",
                       n.trees = boost.n.trees, shrinkage = lambda, verbose = F)

  # calculate training MSE for this shrinkage value
  yhat.boost.train <- predict(hitters.boost, newdata = train.df, n.trees = boost.n.trees)
  stopifnot(identical(length(yhat.boost.train), length(train.df$Salary)))

  train.rows[[i]] <- tibble(lambdas = lambda,
                            train.MSEs = mean((yhat.boost.train - train.df$Salary)^2),
                            DataSet = "train")

  # calculate test MSE (and R^2) for this shrinkage value
  yhat.boost.test <- predict(hitters.boost, newdata = test.df, n.trees = boost.n.trees)
  stopifnot(identical(length(yhat.boost.test), length(test.df$Salary)))

  test.rows[[i]] <- tibble(lambdas = lambda,
                           test.MSEs = mean((yhat.boost.test - test.df$Salary)^2),
                           r.squared = round((cor(yhat.boost.test, test.df$Salary) ^ 2), 3),
                           DataSet = "test")
}

# NOTE: hitters.boost deliberately remains the model fitted with the LAST
# lambda of the grid; later code (summary(hitters.boost)) relies on this.
shrinkage.MSE.train.plot <- bind_rows(train.rows)
shrinkage.MSE.test.plot <- bind_rows(test.rows)


# Shrinkage vs. training MSE
ggplot() + 
        # plot test samples
        geom_line(data = shrinkage.MSE.train.plot,
            aes(x = lambdas, y = train.MSEs, color = DataSet)) +
  labs(title="Shinkage factor vs. train MSE")

# Shrinkage vs. test MSE
ggplot() + 
        # plot test samples
        geom_line(data = shrinkage.MSE.test.plot,
            aes(x = lambdas, y = test.MSEs, color = DataSet)) +
  labs(title="Shinkage factor vs. test MSE")

# e) Compare test MSE of boosting with the test MSE of a linear model and lasso 

# find which lambda value gives the minimum test MSE
shrinkage.MSE.test.plot %>% slice(which.min(test.MSEs))
# build a linear model on training data and find the test MSE
hitters.lm = lm(Salary ~ ., data = train.df)
preds <- predict(hitters.lm,  test.df, interval = "prediction")
yhat.lm.test <- preds[,"fit"]
stopifnot(identical (length(yhat.lm.test), length(test.df$Salary)) )

print ("Apply Linear regression:")
## [1] "Apply Linear regression:"
# LM test MSE
(test.MSE = mean((yhat.lm.test - test.df$Salary)^2))
## [1] 0.4539836
# LM R squared
(r.squared = round((cor(yhat.lm.test, test.df$Salary) ^ 2), 3))
## [1] 0.396
# Clearly performance of linear model cannot beat boosing model


print ("Apply Lasso  cross validation to find the best lambda and corresponding coeffs")
## [1] "Apply Lasso  cross validation to find the best lambda and corresponding coeffs"
# First construct matrix from dataframe (and drop intercept column)

x.train <- model.matrix(Salary~., train.df)[, -1] # drop the intercept
x.test <- model.matrix(Salary~., test.df)[, -1] # drop the intercept
y.train <- train.df$Salary
y.test <- test.df$Salary

cv.out <- cv.glmnet(x.train, y.train, alpha=1) # Lasso
plot(cv.out)

best.lambda <- cv.out$lambda.min

# predict the model on test
pred.lasso <- predict(cv.out, s=best.lambda, newx=x.test)

# Lasso test mse for the best lambda we chose:
(lasso.mse <- mean((pred.lasso - y.test)^2))
## [1] 0.4065154
# R squared for the best lambda we chose:
(lasso.R.squared <- round((cor(pred.lasso, y.test) ^ 2), 3))
##    [,1]
## 1 0.427
# f) Most important predictors in boostnig model:

summary (hitters.boost)
# it to what predictors lasso chose:
# Coeffs for Lasso:
(coefs.lasso <- predict (cv.out, type = "coefficients", s = best.lambda))
## 20 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept)  4.628897e+00
## AtBat       -2.083146e-03
## Hits         9.695646e-03
## HmRun        3.981258e-03
## Runs         7.700172e-04
## RBI          .           
## Walks        8.552672e-03
## Years        4.815029e-02
## CAtBat       3.640107e-05
## CHits        .           
## CHmRun       9.260731e-04
## CRuns        3.665106e-04
## CRBI         .           
## CWalks       .           
## LeagueN      1.923153e-01
## DivisionW   -2.065707e-01
## PutOuts      3.612672e-04
## Assists      7.686914e-04
## Errors      -1.043280e-02
## NewLeagueN  -1.992285e-01
# It is noticeable that the "CHits" predictor, which is the most important
# predictor in the boosting model, is dropped by Lasso

# g) apply bagging to training set and calculate testMSE and R squared
# (bagging = random forest with mtry set to ALL predictors)
hitters.bag = randomForest(Salary ~ ., data = train.df, mtry=ncol(train.df) - 1, importance=T)
hitters.bag
## 
## Call:
##  randomForest(formula = Salary ~ ., data = train.df, mtry = ncol(train.df) -      1, importance = T) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 19
## 
##           Mean of squared residuals: 0.1724127
##                     % Var explained: 79.67
yhat.bag <- predict(hitters.bag, newdata = test.df)

# bagging test mse:
(bag.mse <- mean((yhat.bag - test.df$Salary)^2))
## [1] 0.2377224
# R squared for testMSE:
(bag.R.squared <- round((cor(yhat.bag, test.df$Salary) ^ 2), 3))
## [1] 0.637
# surprisingly bagging produced a better result than gradient boosting
library(tidyverse)
library(gbm)
library(glmnet) # for Lasso
library(randomForest)
library(dataPreparation)
library(class)
options(warn = 1)
set.seed(1113)
caravan.df = read.csv("/Users/shahrdadshadab/env/my-R-project/ISLR/Data/Caravan.csv", header=T, 
                      stringsAsFactors = F, na.strings = "?")

caravan.df = as_tibble(caravan.df)


# remove constant variables
constant_cols <- whichAreConstant(caravan.df)
## [1] "whichAreConstant: it took me 0.01s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(caravan.df))
## [1] "whichAreInDouble: it took me 0.1s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(caravan.df))
## [1] "whichAreBijection: it took me 1.95s to identify 0 column(s) to drop."
## integer(0)
caravan.df <- if (!is.null(bijections_cols) & length(bijections_cols) > 0 ) caravan.df[- bijections_cols] else caravan.df

nrow(caravan.df)
## [1] 5822
caravan.df %>% 
   filter_all(all_vars(is.na(.))) 
# No NA is in data set 

# Since the boosting API with 'bernoulli' distribution needs 0/1 values
# we cannot use a factor; instead we manually encode Purchase
caravan.df$Purchase <- ifelse(caravan.df$Purchase == "Yes", 1, 0)
str(caravan.df)
## Classes 'data.table' and 'data.frame':   5822 obs. of  87 variables:
##  $ X       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MOSTYPE : int  33 37 37 9 40 23 39 33 33 11 ...
##  $ MAANTHUI: int  1 1 1 1 1 1 2 1 1 2 ...
##  $ MGEMOMV : int  3 2 2 3 4 2 3 2 2 3 ...
##  $ MGEMLEEF: int  2 2 2 3 2 1 2 3 4 3 ...
##  $ MOSHOOFD: int  8 8 8 3 10 5 9 8 8 3 ...
##  $ MGODRK  : int  0 1 0 2 1 0 2 0 0 3 ...
##  $ MGODPR  : int  5 4 4 3 4 5 2 7 1 5 ...
##  $ MGODOV  : int  1 1 2 2 1 0 0 0 3 0 ...
##  $ MGODGE  : int  3 4 4 4 4 5 5 2 6 2 ...
##  $ MRELGE  : int  7 6 3 5 7 0 7 7 6 7 ...
##  $ MRELSA  : int  0 2 2 2 1 6 2 2 0 0 ...
##  $ MRELOV  : int  2 2 4 2 2 3 0 0 3 2 ...
##  $ MFALLEEN: int  1 0 4 2 2 3 0 0 3 2 ...
##  $ MFGEKIND: int  2 4 4 3 4 5 3 5 3 2 ...
##  $ MFWEKIND: int  6 5 2 4 4 2 6 4 3 6 ...
##  $ MOPLHOOG: int  1 0 0 3 5 0 0 0 0 0 ...
##  $ MOPLMIDD: int  2 5 5 4 4 5 4 3 1 4 ...
##  $ MOPLLAAG: int  7 4 4 2 0 4 5 6 8 5 ...
##  $ MBERHOOG: int  1 0 0 4 0 2 0 2 1 2 ...
##  $ MBERZELF: int  0 0 0 0 5 0 0 0 1 0 ...
##  $ MBERBOER: int  1 0 0 0 4 0 0 0 0 0 ...
##  $ MBERMIDD: int  2 5 7 3 0 4 4 2 1 3 ...
##  $ MBERARBG: int  5 0 0 1 0 2 1 5 8 3 ...
##  $ MBERARBO: int  2 4 2 2 0 2 5 2 1 3 ...
##  $ MSKA    : int  1 0 0 3 9 2 0 2 1 1 ...
##  $ MSKB1   : int  1 2 5 2 0 2 1 1 1 2 ...
##  $ MSKB2   : int  2 3 0 1 0 2 4 2 0 1 ...
##  $ MSKC    : int  6 5 4 4 0 4 5 5 8 4 ...
##  $ MSKD    : int  1 0 0 0 0 2 0 2 1 2 ...
##  $ MHHUUR  : int  1 2 7 5 4 9 6 0 9 0 ...
##  $ MHKOOP  : int  8 7 2 4 5 0 3 9 0 9 ...
##  $ MAUT1   : int  8 7 7 9 6 5 8 4 5 6 ...
##  $ MAUT2   : int  0 1 0 0 2 3 0 4 2 1 ...
##  $ MAUT0   : int  1 2 2 0 1 3 1 2 3 2 ...
##  $ MZFONDS : int  8 6 9 7 5 9 9 6 7 6 ...
##  $ MZPART  : int  1 3 0 2 4 0 0 3 2 3 ...
##  $ MINKM30 : int  0 2 4 1 0 5 4 2 7 2 ...
##  $ MINK3045: int  4 0 5 5 0 2 3 5 2 3 ...
##  $ MINK4575: int  5 5 0 3 9 3 3 3 1 3 ...
##  $ MINK7512: int  0 2 0 0 0 0 0 0 0 1 ...
##  $ MINK123M: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MINKGEM : int  4 5 3 4 6 3 3 3 2 4 ...
##  $ MKOOPKLA: int  3 4 4 4 3 3 5 3 3 7 ...
##  $ PWAPART : int  0 2 2 0 0 0 0 0 0 2 ...
##  $ PWABEDR : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PWALAND : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PPERSAUT: int  6 0 6 6 0 6 6 0 5 0 ...
##  $ PBESAUT : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PMOTSCO : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PVRAAUT : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PAANHANG: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PTRACTOR: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PWERKT  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PBROM   : int  0 0 0 0 0 0 0 3 0 0 ...
##  $ PLEVEN  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PPERSONG: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PGEZONG : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PWAOREG : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PBRAND  : int  5 2 2 2 6 0 0 0 0 3 ...
##  $ PZEILPL : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PPLEZIER: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PFIETS  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PINBOED : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PBYSTAND: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AWAPART : int  0 2 1 0 0 0 0 0 0 1 ...
##  $ AWABEDR : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AWALAND : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ APERSAUT: int  1 0 1 1 0 1 1 0 1 0 ...
##  $ ABESAUT : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AMOTSCO : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AVRAAUT : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AAANHANG: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ATRACTOR: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AWERKT  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ABROM   : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ ALEVEN  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ APERSONG: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AGEZONG : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AWAOREG : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ABRAND  : int  1 1 1 1 1 0 0 0 0 1 ...
##  $ AZEILPL : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ APLEZIER: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AFIETS  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AINBOED : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ABYSTAND: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Purchase: num  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# a) let's create 1000 observations in the training set and the rest in the test set:

(n <- nrow(caravan.df))
## [1] 5822
train.idx <- sample(seq_len(n), 1000)
test.idx <- setdiff(seq_len(n), train.idx)

# sanity check: every row lands in exactly one of the two sets
stopifnot(identical(length(train.idx) + length(test.idx), n))

train.df <- caravan.df[train.idx, ]
test.df <- caravan.df[test.idx, ]



# b) Fit a boosting model and find the most important variable
caravan.boost <- gbm(Purchase ~ . , data = train.df, distribution="bernoulli", 
                   n.trees = 1000, shrinkage=0.01, verbose=F)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution, :
## variable 61: PZEILPL has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution, :
## variable 82: AZEILPL has no variation.
# MSKC is the most important predictor (per the relative influence table)
summary(caravan.boost)
# c) predict that a person will make a purchase if the estimated probability
#    of purchase > 0.2
# BUG FIX: for distribution = "bernoulli", predict.gbm() returns the *logit*
# (log-odds) scale by default, not probabilities. Thresholding the logit at
# 0.2 is not the same as "probability > 0.2" — request the probability scale
# explicitly with type = "response".
prob.boost.test <- predict(caravan.boost, newdata = test.df, n.trees = 1000,
                           type = "response")

# Since we encoded Yes as 1 and No as 0 we have
yhat.boost.test <- ifelse(prob.boost.test > 0.2, 1, 0)

stopifnot(identical(length(prob.boost.test), length(test.df$Purchase)))

#Confusion Matrix:
(confusion_table <- table(yhat.boost.test, test.df$Purchase))
##                
## yhat.boost.test    0    1
##               0 4524  296
##               1    1    1
# TN: observed as 0 and predicted as 0 (number of people predicted NOT to make a purchase who really did not)
true.negative <- confusion_table[1,1]

# TP: observed as 1 and predicted as 1 (number of people predicted to make a purchase who really did)
true.positive <- confusion_table[2,2]

# FP: observed as 0 and predicted as 1 (number of people predicted to make a purchase who did not)
false.positive <- confusion_table[2,1]

# FN: observed as 1 and predicted as 0 (number of people predicted not to purchase who in reality did)
false.negative <- confusion_table[1,2]

observations.total <- true.negative + false.negative + true.positive + false.positive
observed.as.0 <- true.negative + false.positive
observed.as.1 <- false.negative + true.positive

# Random guessing rate (always predict the majority class — here "no purchase")
(nullClassifier <- max(observed.as.0 / observations.total, observed.as.1 / observations.total))
## [1] 0.9384073
# classification Rate:
(classification.rate <- mean(yhat.boost.test == test.df$Purchase))
## [1] 0.9384073
# Test error Rate:
(missclassificationRate <- 1 - classification.rate)
## [1] 0.0615927
# False Positive Rate (or: Type I error, 1 - specificity):
(FP_rates <- false.positive / observed.as.0)
## [1] 0.0002209945
# True Positive Rate (or: 1 - Type II error, power, sensitivity, recall):
(TP_rates <- true.positive / observed.as.1)
## [1] 0.003367003
# Precision (1 - false discovery proportion):
# Model predicted that two people make a purchase but in reality only one did (50%)
(precisions <- true.positive / (true.positive + false.positive))
## [1] 0.5
# Specificity
(specificities <-  1 - false.positive/(false.positive + true.negative))
## [1] 0.999779
library(pROC)
# ROC from the continuous boosting scores (valid: monotone in Pr(purchase))
roc_obj <- roc(unlist(test.df$Purchase), prob.boost.test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Let's draw some AUC plots
plot(roc_obj, legacy.axes = TRUE)

# -------------------------------------------------------------
# Let's use logistic regression to estimate what percentage of 
# people who were predicted to make a purchase really did
glm.model <- glm(Purchase ~ . , data = train.df, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm.model)
## 
## Call:
## glm(formula = Purchase ~ ., family = binomial, data = train.df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5983  -0.2918  -0.1463  -0.0555   3.3308  
## 
## Coefficients: (3 not defined because of singularities)
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  2.645e+02  6.732e+04   0.004   0.9969  
## X            3.404e-05  1.043e-04   0.327   0.7440  
## MOSTYPE      1.421e-01  1.327e-01   1.070   0.2844  
## MAANTHUI    -9.558e-01  7.816e-01  -1.223   0.2214  
## MGEMOMV      2.900e-01  4.152e-01   0.698   0.4849  
## MGEMLEEF     6.166e-01  3.394e-01   1.817   0.0692 .
## MOSHOOFD    -7.014e-01  5.946e-01  -1.180   0.2382  
## MGODRK      -1.032e-01  3.303e-01  -0.312   0.7548  
## MGODPR       4.260e-02  3.535e-01   0.121   0.9041  
## MGODOV      -3.587e-02  3.140e-01  -0.114   0.9091  
## MGODGE      -1.753e-01  3.260e-01  -0.538   0.5907  
## MRELGE       3.549e-01  4.685e-01   0.757   0.4488  
## MRELSA       4.875e-01  4.441e-01   1.098   0.2724  
## MRELOV       5.770e-01  4.565e-01   1.264   0.2063  
## MFALLEEN    -2.331e-02  3.901e-01  -0.060   0.9524  
## MFGEKIND     1.634e-01  4.040e-01   0.404   0.6859  
## MFWEKIND     9.659e-02  4.198e-01   0.230   0.8180  
## MOPLHOOG    -3.250e-01  3.791e-01  -0.857   0.3912  
## MOPLMIDD    -3.506e-01  4.118e-01  -0.852   0.3945  
## MOPLLAAG    -6.335e-01  4.154e-01  -1.525   0.1273  
## MBERHOOG     1.507e-01  2.906e-01   0.519   0.6039  
## MBERZELF     2.793e-01  3.195e-01   0.874   0.3821  
## MBERBOER     1.331e-01  3.017e-01   0.441   0.6592  
## MBERMIDD     3.781e-01  2.652e-01   1.426   0.1539  
## MBERARBG     2.778e-01  2.721e-01   1.021   0.3072  
## MBERARBO     3.859e-01  2.707e-01   1.426   0.1539  
## MSKA         5.966e-01  3.372e-01   1.769   0.0769 .
## MSKB1        3.270e-01  3.041e-01   1.075   0.2823  
## MSKB2        2.533e-01  2.700e-01   0.938   0.3482  
## MSKC         5.236e-01  3.104e-01   1.687   0.0917 .
## MSKD         4.964e-01  2.958e-01   1.678   0.0934 .
## MHHUUR      -1.506e+01  5.107e+03  -0.003   0.9976  
## MHKOOP      -1.499e+01  5.107e+03  -0.003   0.9977  
## MAUT1       -3.144e-01  4.426e-01  -0.710   0.4775  
## MAUT2       -2.313e-01  3.972e-01  -0.582   0.5604  
## MAUT0       -3.000e-01  4.414e-01  -0.680   0.4967  
## MZFONDS     -1.546e+01  5.465e+03  -0.003   0.9977  
## MZPART      -1.548e+01  5.465e+03  -0.003   0.9977  
## MINKM30     -2.913e-01  3.047e-01  -0.956   0.3390  
## MINK3045    -8.679e-02  2.964e-01  -0.293   0.7697  
## MINK4575    -3.181e-01  3.008e-01  -1.058   0.2902  
## MINK7512    -1.737e-01  2.935e-01  -0.592   0.5539  
## MINK123M    -2.350e-01  4.364e-01  -0.539   0.5902  
## MINKGEM      1.818e-01  2.678e-01   0.679   0.4972  
## MKOOPKLA     1.381e-01  1.461e-01   0.946   0.3443  
## PWAPART      1.792e+01  2.584e+03   0.007   0.9945  
## PWABEDR     -1.807e+00  4.826e+03   0.000   0.9997  
## PWALAND     -3.753e+01  3.604e+03  -0.010   0.9917  
## PPERSAUT     1.238e-01  1.374e-01   0.901   0.3675  
## PBESAUT      5.051e+00  4.702e+03   0.001   0.9991  
## PMOTSCO      5.868e-01  1.061e+00   0.553   0.5802  
## PVRAAUT     -3.158e+00  2.955e+03  -0.001   0.9991  
## PAANHANG    -1.459e+00  9.047e+03   0.000   0.9999  
## PTRACTOR    -1.164e+01  9.330e+02  -0.012   0.9900  
## PWERKT      -5.459e+01  1.941e+04  -0.003   0.9978  
## PBROM       -2.385e+00  1.783e+00  -1.338   0.1810  
## PLEVEN       9.380e-02  2.555e-01   0.367   0.7135  
## PPERSONG     6.381e-01  5.150e+03   0.000   0.9999  
## PGEZONG      1.966e+01  9.139e+03   0.002   0.9983  
## PWAOREG     -2.737e+00  4.234e+03  -0.001   0.9995  
## PBRAND       3.877e-01  2.050e-01   1.891   0.0586 .
## PZEILPL             NA         NA      NA       NA  
## PPLEZIER     1.173e+00  1.250e+04   0.000   0.9999  
## PFIETS      -2.387e-01  2.819e+00  -0.085   0.9325  
## PINBOED      8.740e+00  9.541e+03   0.001   0.9993  
## PBYSTAND    -1.609e+00  1.244e+00  -1.293   0.1960  
## AWAPART     -3.558e+01  5.168e+03  -0.007   0.9945  
## AWABEDR     -1.273e+01  1.151e+04  -0.001   0.9991  
## AWALAND      5.998e+01  9.722e+03   0.006   0.9951  
## APERSAUT     4.426e-01  6.197e-01   0.714   0.4751  
## ABESAUT     -2.405e+01  1.604e+04  -0.001   0.9988  
## AMOTSCO     -2.733e+00  4.816e+00  -0.568   0.5704  
## AVRAAUT             NA         NA      NA       NA  
## AAANHANG    -1.490e+01  1.535e+04  -0.001   0.9992  
## ATRACTOR     3.659e+01  2.799e+03   0.013   0.9896  
## AWERKT       9.620e+01  3.767e+04   0.003   0.9980  
## ABROM        5.302e+00  3.583e+00   1.480   0.1389  
## ALEVEN      -1.476e-01  7.088e-01  -0.208   0.8351  
## APERSONG    -1.743e+01  1.643e+04  -0.001   0.9992  
## AGEZONG     -5.760e+01  2.742e+04  -0.002   0.9983  
## AWAOREG     -1.288e+00  2.192e+04   0.000   1.0000  
## ABRAND      -4.226e-01  7.520e-01  -0.562   0.5741  
## AZEILPL             NA         NA      NA       NA  
## APLEZIER    -2.323e+01  4.495e+04  -0.001   0.9996  
## AFIETS       1.134e+00  1.870e+00   0.606   0.5442  
## AINBOED     -2.485e+01  1.558e+04  -0.002   0.9987  
## ABYSTAND     6.640e+00  4.463e+00   1.488   0.1368  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 402.90  on 999  degrees of freedom
## Residual deviance: 287.16  on 916  degrees of freedom
## AIC: 455.16
## 
## Number of Fisher Scoring iterations: 19
# do the prediction using the model
# NOTE: spell out se.fit — the original `se = T` and `preds$se` only worked
# through R's partial argument/element matching (predict.glm's argument and
# returned list element are both named `se.fit`).
preds <- predict(glm.model, newdata = test.df, se.fit = TRUE)
## Warning in predict.lm(object, newdata, se.fit, scale = residual.scale, type = if
## (type == : prediction from a rank-deficient fit may be misleading
# predict() for a glm returns the linear predictor (logit) by default:
# log(Pr(Y=1 | X) / (1 - Pr(Y=1 | X))) = X*beta, and se.fit is on the same
# scale. To get Pr(Y=1|X) apply the inverse logit exp(eta)/(1+exp(eta)).

prob.predict <- exp(preds$fit)/(1+exp(preds$fit))
# approximate band: push (fit +/- 2*se) through the inverse logit
se.bound.logit <- cbind(preds$fit + 2*preds$se.fit, preds$fit - 2*preds$se.fit)
se.bound <- apply(se.bound.logit, 2, function(x) exp(x)/(1+exp(x)))

# classify as a purchase when the estimated probability exceeds 0.2
yhat.logReg.test <- ifelse(prob.predict > 0.2, 1, 0)

stopifnot(identical(length(prob.predict), length(test.df$Purchase)))

# Confusion Matrix: rows = predicted class, columns = observed class
(confusion_table <- table(yhat.logReg.test, test.df$Purchase))
##                 
## yhat.logReg.test    0    1
##                0 4214  240
##                1  311   57
# TN: observed as 0 and predicted as 0 (number of people predicted not to make any purchase who really did not)
true.negative <- confusion_table[1,1]

# TP: observed as 1 and predicted as 1 (number of people predicted to make a purchase who in reality did)
true.positive <- confusion_table[2,2]

# FP: observed as 0 and predicted as 1 (number of people predicted to make a purchase who did not)
false.positive <- confusion_table[2,1]

# FN: observed as 1 and predicted as 0 (number of people predicted not to make a purchase who in reality did)
false.negative <- confusion_table[1,2]

# Row/column totals used as denominators for the rates below
observations.total <- true.negative + false.negative + true.positive + false.positive
observed.as.0 <- true.negative + false.positive
observed.as.1 <- false.negative + true.positive

# Random guessing rate: accuracy of always predicting the majority class
(nullClassifier <- max(observed.as.0 / observations.total, observed.as.1 / observations.total))
## [1] 0.9384073
# classification Rate:
(classification.rate <- mean(yhat.logReg.test == test.df$Purchase))
## [1] 0.8857321
# Test error Rate:
(missclassificationRate <- 1 - classification.rate)
## [1] 0.1142679
# False Positive Rate (or: Type I error, 1 - specificity):
(FP_rates <- false.positive / observed.as.0)
## [1] 0.06872928
# True Positive Rate (or: 1 - Type II error, power, sensitivity, recall):
(TP_rates <- true.positive / observed.as.1)
## [1] 0.1919192
# Precision (positive predictive value, 1 - false discovery proportion):
# Model predicted that 311+57 people make a purchase but in reality only 57 did (~15%)
(precisions <- true.positive / (true.positive + false.positive))
## [1] 0.1548913
# Specificity: proportion of observed 0s correctly predicted as 0
(specificities <-  1 - false.positive/(false.positive + true.negative))
## [1] 0.9312707
library(pROC)
# roc() takes the observed labels and a predictor score; the hard 0/1
# predictions are used here, so the curve has a single operating point
roc_obj <- roc(unlist(test.df$Purchase), yhat.logReg.test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Let's draw some AUC plots
plot(roc_obj, legacy.axes = TRUE)

# Precision dropped from 50% in gradient boosting to 15% in logistic regression 
library(tidyverse)
library(gbm)      # for original implementation of regular and stochastic GBMs
library(h2o)      # for a java-based implementation of GBM variants
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## The following object is masked from 'package:pROC':
## 
##     var
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(xgboost)  # for fitting extreme gradient boosting
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(glmnet) # for Lasso
library(randomForest)
library(dataPreparation)
library(class)
library(doParallel)  # for parallel backend to foreach
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
library(foreach)     # for parallel processing with for loops

# Modeling packages
library(caret)       # for general model fitting
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(rpart)       # for fitting decision trees
library(ipred)   


options(warn = 1)  # print warnings as they occur instead of collecting them
set.seed(1113)     # reproducible train/test split below


# Load the Smarket data (daily S&P 500 returns, ISLR); "?" is read as NA
smarket.df = 
  read.csv("/Users/shahrdadshadab/env/my-R-project/ISLR/Data/Smarket.csv", 
           header=T, stringsAsFactors = F, na.strings = "?")

str(smarket.df)
## 'data.frame':    1250 obs. of  10 variables:
##  $ X        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Year     : int  2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
##  $ Lag1     : num  0.381 0.959 1.032 -0.623 0.614 ...
##  $ Lag2     : num  -0.192 0.381 0.959 1.032 -0.623 ...
##  $ Lag3     : num  -2.624 -0.192 0.381 0.959 1.032 ...
##  $ Lag4     : num  -1.055 -2.624 -0.192 0.381 0.959 ...
##  $ Lag5     : num  5.01 -1.055 -2.624 -0.192 0.381 ...
##  $ Volume   : num  1.19 1.3 1.41 1.28 1.21 ...
##  $ Today    : num  0.959 1.032 -0.623 0.614 0.213 ...
##  $ Direction: chr  "Up" "Up" "Down" "Up" ...
# identify constant variables (dataPreparation; none found here)
constant_cols <- whichAreConstant(smarket.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# identify variables that duplicate another (for example col1 == col2)
(double_cols <- whichAreInDouble(smarket.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# identify variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(smarket.df))
## [1] "whichAreBijection: it took me 0.06s to identify 0 column(s) to drop."
## integer(0)
# Drop bijection columns only when some were actually found. Use the scalar,
# short-circuiting `&&` in the `if` condition (the original elementwise `&`
# happened to work only because both sides were length-1).
smarket.df <- if (!is.null(bijections_cols) && length(bijections_cols) > 0) {
  smarket.df[-bijections_cols]
} else {
  smarket.df
}

# Show any rows that are entirely NA (expected: none)
smarket.df %>% 
   filter_all(all_vars(is.na(.))) 
# make sure no "NA" string is hiding in the character column Direction
(na.count <- smarket.df %>%
  filter(Direction == "NA") %>%
  nrow)
## [1] 0
# The original assertion `stopifnot(na.count != numeric(0))` compared against a
# zero-length vector, which yields logical(0) and makes stopifnot() pass
# vacuously whatever na.count is. Assert the intended condition directly.
stopifnot(na.count == 0)

# make sure every Direction value is either "Up" or "Down"

ups <- smarket.df %>%
  filter(Direction == "Up") %>%
  nrow

downs <- smarket.df %>%
  filter(Direction == "Down") %>%
  nrow

# identical() also checks type; both sides are integer counts here
stopifnot(identical(ups+downs, nrow(smarket.df)))

# Since the boosting API with 'bernoulli' distribution needs 0 and 1 values
# we cannot use a factor; instead we manually encode them
smarket.df$Direction.01 <- ifelse(smarket.df$Direction == "Up", 1, 0)

# Keep a factor version of the response as well (used later by h2o)
smarket.df$Direction <- as.factor(smarket.df$Direction)

str(smarket.df$Direction)
##  Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...
# factor codes (levels sorted alphabetically): "Down" -> 1 and "Up" -> 2
head(smarket.df$Direction)
## [1] Up   Up   Down Up   Up   Up  
## Levels: Down Up
head(as.integer(smarket.df$Direction))
## [1] 2 2 1 2 2 2
## [1] 2 2 1 2 2 2
# split to train and test (70% / 30%)
(n <- nrow(smarket.df))
## [1] 1250
# Round to a whole number of rows. The original `round(0.7*n, 3)` rounded to
# three DECIMAL places and only yielded an integer because 0.7 * 1250 happens
# to be exact; for general n it would hand sample() a fractional size.
(no.of.train <- round(0.7 * n))
## [1] 875
train.idx <- sample(seq_len(n), no.of.train) 
test.idx <- setdiff(seq_len(n), train.idx)

# Sanity check: the two index sets partition the rows exactly
stopifnot(identical(length(train.idx) + length(test.idx), n))

train.df <- smarket.df[train.idx, ]
test.df <- smarket.df[test.idx, ]


# ----------------------Gradient boosting -----------------------
# Boosting hyperparameters: 
# -------------------------
#   
# Number of trees: The total number of trees in the sequence or ensemble.
# Learning rate (shrinkage): the smaller this value, the more accurate the model can 
# be but also will require more trees in the sequence.
# 
#
# Tree hyperparameters:
# ----------------------  
# 
# Tree depth: Controls the depth of the individual trees. Typical values range from a 
# depth of 3–8 but it is not uncommon to see a tree depth of 1. Note that larger n or p
# training data sets are more tolerable to deeper trees.
# 
# Minimum number of observations in terminal nodes: Controls the complexity of each tree. 
# Typical values range from 5–15 where higher values help prevent a model from learning 
# relationships which might be highly specific to the particular sample selected 
# for a tree (overfitting) but smaller values can help with imbalanced target classes in classification problems.


# "expand.grid" create a Data Frame from All Combinations of Factor Variables
hyper_grid <- expand.grid(
  learning_rate = c(0.3, 0.1, 0.05, 0.01, 0.005), # values between 0.05–0.2 should work across a wide range of problems
  RMSE = NA,
  trees = NA,
  time = NA
)

for(i in seq_len(nrow(hyper_grid))){
  set.seed(123)  
  train_time <- system.time({
    m <- gbm(
      formula = Direction.01 ~ . -Direction ,
      data = train.df,
      distribution = "bernoulli",
      n.trees = 5000, 
      shrinkage = hyper_grid$learning_rate[i], 
      interaction.depth = 3, 
      n.minobsinnode = 10,
      cv.folds = 10 
   )
  })
  
  # add SSE, trees, and training time to results
  hyper_grid$RMSE[i]  <- sqrt(min(m$cv.error))
  hyper_grid$trees[i] <- which.min(m$cv.error)
  hyper_grid$Time[i]  <- train_time[["elapsed"]]

}

# The optimal value for the learning rate (i.e. shrinkage parameter) is 0.3
arrange(hyper_grid, RMSE)
# Next, we'll set our learning rate at the optimal level (0.3) and tune 
# the tree specific hyperparameters (interaction.depth and n.minobsinnode). 
# Adjusting the tree-specific parameters provides an additional reduction in RMSE.

# search grid: all combinations of tree depth and terminal-node size,
# with the tree count and learning rate fixed at their tuned values
hyper_grid <- expand.grid(
  n.trees = 6000,
  shrinkage = 0.3,
  interaction.depth = c(3, 5, 7),
  n.minobsinnode = c(5, 10, 15)
)

# Fit one 10-fold cross-validated bernoulli GBM on train.df for a single
# hyperparameter combination and report its cross-validated error.
#
# n.trees           total number of boosting iterations
# shrinkage         learning rate
# interaction.depth maximum depth of each individual tree
# n.minobsinnode    minimum observations allowed in a terminal node
#
# Returns: square root of the smallest CV error over all iterations.
model_fit <- function(n.trees, shrinkage, interaction.depth, n.minobsinnode) {
  set.seed(123)  # identical folds for every combination, so errors are comparable
  fitted.gbm <- gbm(
    formula = Direction.01 ~ . -Direction,
    data = train.df,
    distribution = "bernoulli",
    n.trees = n.trees,
    shrinkage = shrinkage,
    interaction.depth = interaction.depth,
    n.minobsinnode = n.minobsinnode,
    cv.folds = 10
  )
  best.cv.error <- min(fitted.gbm$cv.error)
  sqrt(best.cv.error)
}

# perform the grid search with functional programming.
# pmap_dbl() matches the hyper_grid columns to model_fit()'s arguments by NAME
# (the columns are exactly n.trees, shrinkage, interaction.depth and
# n.minobsinnode), which is robust to column reordering — the original
# ~ model_fit(..1, ..2, ..3, ..4) lambda silently depended on column positions.
hyper_grid$rmse <- purrr::pmap_dbl(hyper_grid, model_fit)

# Top row RMSE shows it is reduced to 7.332922e-07 
arrange(hyper_grid, rmse)
# thus we choose values of n.trees -> 6000, interaction.depth -> 3, 
# and n.minobsinnode -> 5 from the top row as the parameters of 
# top model so far.
# Now to improve the model we took our top model’s hyperparameter settings, 

# Refit with more trees and a smaller learning rate to see whether the
# cross-validated error improves any further.
# NOTE(review): shrinkage = 0.06 here although the search above selected 0.3 —
# presumably a deliberate "more trees / lower rate" trade-off; confirm.
m <- gbm(
  formula = Direction.01 ~ . -Direction,
  data = train.df,
  distribution = "bernoulli",
  n.trees = 10000,
  shrinkage = 0.06,
  interaction.depth = 3,
  n.minobsinnode = 5,
  cv.folds = 10
)
# compute RMSE (square root of the minimum cross-validation error)
sqrt(min(m$cv.error))
## [1] 3.274839e-06
# Seems like no more improvement is possible, thus the following are the best
# set of hyperparameters:

  # n.trees = 6000,
  # shrinkage = 0.3,
  # interaction.depth = 3,
  # n.minobsinnode = 5,


# Stochastic hyperparameters:
# ---------------------------
# _Subsample rows before creating each tree (available in gbm, h2o, & xgboost)
# _Subsample columns before creating each tree (h2o & xgboost)
# _Subsample columns before considering each split in each tree (h2o & xgboost)
# 
# Generally, aggressive subsampling of rows, such as selecting only 50% or less of 
# the training data, has shown to be beneficial and typical values range between 0.5–0.8.

# Now let's use h2o for grid search to implement a stochastic GBM 
# We use the optimal hyperparameters above and build onto this by assessing a 
# range of values for subsampling rows and columns before each tree is built, 
# and subsampling columns before each split.
# To speed up training we use early stopping for the individual GBM modeling 
# process and also add a stochastic search criteria.

# initial h2o setup: disable progress bars, then start/attach a local cluster
# using all cores (nthreads = -1) and up to 10 GB of memory
h2o.no_progress()
h2o.init(ip = "localhost", port = 54321, nthreads= -1, max_mem_size = "10g")
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 hours 4 minutes 
##     H2O cluster timezone:       America/Toronto 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.28.0.2 
##     H2O cluster version age:    7 months !!! 
##     H2O cluster name:           H2O_started_from_R_shahrdadshadab_mxj391 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   8.55 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 3.6.2 (2019-12-12)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is too old (7 months)!
## Please download and install the latest version from http://h2o.ai/download/
# prostate_path <- system.file("extdata", "prostate.csv", package = "h2o")
# prostate <- h2o.uploadFile(path = prostate_path)
# drop any objects left on the cluster from previous sessions
h2o.removeAll()




# copy the train/test data frames into the h2o cluster
train_h2o <- as.h2o(train.df)
test_h2o <- as.h2o(test.df)

# note that for classification gbm in h2o we have to use a factor response,
# quite the opposite of R's gbm package where we must use a numeric 0/1 response
response <- "Direction"
predictors <- setdiff(colnames(train.df), c(response, "Direction.01"))


# refined hyperparameter grid (h2o.grid requires a list)
hyper_grid <- list(
  sample_rate = c(0.5, 0.75, 1),              # row subsampling
  col_sample_rate = c(0.5, 0.75, 1),          # col subsampling for each split
  col_sample_rate_per_tree = c(0.5, 0.75, 1)  # col subsampling for each tree
)

# random grid search strategy (h2o.grid requires a list)
search_criteria <- list(
  strategy = "RandomDiscrete",
  stopping_metric = "mse",
  stopping_tolerance = 0.001,  # stop when MSE improvement is less than 0.1%
  stopping_rounds = 10,       # measured over the last 10 models  
  max_runtime_secs = 60*5    # or stop search after 5 minutes  
)


# perform the random grid search over the stochastic-subsampling parameters,
# keeping the previously tuned tree parameters (ntrees/learn_rate/depth) fixed
grid <- h2o.grid(
  algorithm = "gbm",
  grid_id = "gbm_grid",
  x = predictors, 
  y = response,
  training_frame = train_h2o,
  hyper_params = hyper_grid,
  ntrees = 6000,
  learn_rate = 0.3,
  max_depth = 3,
  min_rows = 5,
  nfolds = 10,
  stopping_rounds = 10,
  stopping_tolerance = 0,
  search_criteria = search_criteria,
  seed = 123
)

# collect the results and sort by our model performance metric of choice
grid_perf <- h2o.getGrid(
  grid_id = "gbm_grid", 
  sort_by = "mse", 
  decreasing = FALSE
)

grid_perf
## H2O Grid Details
## ================
## 
## Grid ID: gbm_grid 
## Used hyper parameters: 
##   -  col_sample_rate 
##   -  col_sample_rate_per_tree 
##   -  sample_rate 
## Number of models: 24 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by increasing mse
##   col_sample_rate col_sample_rate_per_tree sample_rate         model_ids
## 1             0.5                     0.75         0.5 gbm_grid_model_21
## 2             0.5                      1.0        0.75 gbm_grid_model_10
## 3             0.5                      1.0         1.0  gbm_grid_model_3
## 4            0.75                      1.0         1.0 gbm_grid_model_11
## 5             0.5                      1.0         0.5 gbm_grid_model_12
##                     mse
## 1  0.001121594179032706
## 2  0.001125135863063456
## 3 0.0011325615576201725
## 4 0.0011352998866518348
## 5 0.0011383758400595574
## 
## ---
##    col_sample_rate col_sample_rate_per_tree sample_rate         model_ids
## 19            0.75                      0.5         1.0  gbm_grid_model_7
## 20             0.5                      0.5         1.0 gbm_grid_model_20
## 21            0.75                      0.5        0.75 gbm_grid_model_13
## 22             1.0                      0.5         1.0  gbm_grid_model_9
## 23             0.5                      0.5        0.75 gbm_grid_model_15
## 24             1.0                      0.5        0.75 gbm_grid_model_24
##                     mse
## 19 0.003389857735489425
## 20 0.003421288880241799
## 21 0.004013881611597751
## 22 0.004561166107787774
## 23 0.005851238329674766
## 24  0.11648661527907224
# Grab the model_id for the top model, chosen by cross-validation error
best_model_id <- grid_perf@model_ids[[1]]
best_model <- h2o.getModel(best_model_id)


# variable importance plot for the best model
vip::vip(best_model) 
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

# Now let’s get performance metrics on the best model
h2o.giniCoef(best_model)
## [1] 1
# xval = TRUE reports metrics on the cross-validation holdout predictions
h2o.performance(model = best_model, xval = TRUE)
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 10-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.001121594
## RMSE:  0.03349021
## LogLoss:  0.005421611
## Mean Per-Class Error:  0
## AUC:  1
## AUCPR:  0.2068966
## Gini:  1
## R^2:  0.9954971
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        Down  Up    Error    Rate
## Down    411   0 0.000000  =0/411
## Up        0 464 0.000000  =0/464
## Totals  411 464 0.000000  =0/875
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.990742   1.000000  64
## 2                       max f2  0.990742   1.000000  64
## 3                 max f0point5  0.990742   1.000000  64
## 4                 max accuracy  0.990742   1.000000  64
## 5                max precision  1.000000   1.000000   0
## 6                   max recall  0.990742   1.000000  64
## 7              max specificity  1.000000   1.000000   0
## 8             max absolute_mcc  0.990742   1.000000  64
## 9   max min_per_class_accuracy  0.990742   1.000000  64
## 10 max mean_per_class_accuracy  0.990742   1.000000  64
## 11                     max tns  1.000000 411.000000   0
## 12                     max fns  1.000000  96.000000   0
## 13                     max fps  0.000000 411.000000 399
## 14                     max tps  0.990742 464.000000  64
## 15                     max tnr  1.000000   1.000000   0
## 16                     max fnr  1.000000   0.206897   0
## 17                     max fpr  0.000000   1.000000 399
## 18                     max tpr  0.990742   1.000000  64
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# full model summary: training metrics, CV metrics, scoring history, importances
summary(best_model)
## Model Details:
## ==============
## 
## H2OBinomialModel: gbm
## Model Key:  gbm_grid_model_21 
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1             754                      754               82070         1
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1         3    1.91247          2          8     3.92042
## 
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
## 
## MSE:  3.380832e-34
## RMSE:  1.838704e-17
## LogLoss:  1.015061e-18
## Mean Per-Class Error:  0
## AUC:  1
## AUCPR:  0.006465517
## Gini:  1
## R^2:  1
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        Down  Up    Error    Rate
## Down    411   0 0.000000  =0/411
## Up        3 461 0.006466  =3/464
## Totals  414 461 0.003429  =3/875
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  1.000000   1.000000   2
## 2                       max f2  1.000000   1.000000   2
## 3                 max f0point5  1.000000   1.000000   2
## 4                 max accuracy  1.000000   1.000000   2
## 5                max precision  1.000000   1.000000   0
## 6                   max recall  1.000000   1.000000   2
## 7              max specificity  1.000000   1.000000   0
## 8             max absolute_mcc  1.000000   1.000000   2
## 9   max min_per_class_accuracy  1.000000   1.000000   2
## 10 max mean_per_class_accuracy  1.000000   1.000000   2
## 11                     max tns  1.000000 411.000000   0
## 12                     max fns  1.000000   3.000000   0
## 13                     max fps  0.000000 411.000000   4
## 14                     max tps  1.000000 464.000000   2
## 15                     max tnr  1.000000   1.000000   0
## 16                     max fnr  1.000000   0.006466   0
## 17                     max fpr  0.000000   1.000000   4
## 18                     max tpr  1.000000   1.000000   2
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## 
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 10-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
## 
## MSE:  0.001121594
## RMSE:  0.03349021
## LogLoss:  0.005421611
## Mean Per-Class Error:  0
## AUC:  1
## AUCPR:  0.2068966
## Gini:  1
## R^2:  0.9954971
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        Down  Up    Error    Rate
## Down    411   0 0.000000  =0/411
## Up        0 464 0.000000  =0/464
## Totals  411 464 0.000000  =0/875
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.990742   1.000000  64
## 2                       max f2  0.990742   1.000000  64
## 3                 max f0point5  0.990742   1.000000  64
## 4                 max accuracy  0.990742   1.000000  64
## 5                max precision  1.000000   1.000000   0
## 6                   max recall  0.990742   1.000000  64
## 7              max specificity  1.000000   1.000000   0
## 8             max absolute_mcc  0.990742   1.000000  64
## 9   max min_per_class_accuracy  0.990742   1.000000  64
## 10 max mean_per_class_accuracy  0.990742   1.000000  64
## 11                     max tns  1.000000 411.000000   0
## 12                     max fns  1.000000  96.000000   0
## 13                     max fps  0.000000 411.000000 399
## 14                     max tps  0.990742 464.000000  64
## 15                     max tnr  1.000000   1.000000   0
## 16                     max fnr  1.000000   0.206897   0
## 17                     max fpr  0.000000   1.000000 399
## 18                     max tpr  0.990742   1.000000  64
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary: 
##                mean         sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## accuracy        1.0        0.0        1.0        1.0        1.0        1.0
## auc             1.0        0.0        1.0        1.0        1.0        1.0
## aucpr     0.7599853 0.38075897      0.975 0.97619045  0.9811321  0.9756098
## err             0.0        0.0        0.0        0.0        0.0        0.0
## err_count       0.0        0.0        0.0        0.0        0.0        0.0
##           cv_5_valid cv_6_valid cv_7_valid  cv_8_valid cv_9_valid cv_10_valid
## accuracy         1.0        1.0        1.0         1.0        1.0         1.0
## auc              1.0        1.0        1.0         1.0        1.0         1.0
## aucpr     0.97959185  0.9787234     0.6875 0.071428575  0.9302326 0.044444446
## err              0.0        0.0        0.0         0.0        0.0         0.0
## err_count        0.0        0.0        0.0         0.0        0.0         0.0
## 
## ---
##                    mean          sd    cv_1_valid  cv_2_valid  cv_3_valid
## pr_auc        0.7599853  0.38075897         0.975  0.97619045   0.9811321
## precision           1.0         0.0           1.0         1.0         1.0
## r2           0.99581623 0.013228734           1.0         1.0    0.999996
## recall              1.0         0.0           1.0         1.0         1.0
## rmse        0.010346935 0.032265965 9.8665826E-11 3.373171E-7 9.813164E-4
## specificity         1.0         0.0           1.0         1.0         1.0
##               cv_4_valid cv_5_valid   cv_6_valid   cv_7_valid   cv_8_valid
## pr_auc         0.9756098 0.97959185    0.9787234       0.6875  0.071428575
## precision            1.0        1.0          1.0          1.0          1.0
## r2            0.99999994  0.9581666          1.0          1.0    0.9999999
## recall               1.0        1.0          1.0          1.0          1.0
## rmse        1.3877063E-4 0.10217344 6.9318458E-9 6.078576E-12 1.7548395E-4
## specificity          1.0        1.0          1.0          1.0          1.0
##               cv_9_valid   cv_10_valid
## pr_auc         0.9302326   0.044444446
## precision            1.0           1.0
## r2                   1.0           1.0
## recall               1.0           1.0
## rmse        3.0627751E-9 1.2293739E-14
## specificity          1.0           1.0
## 
## Scoring History: 
##             timestamp          duration number_of_trees training_rmse
## 1 2020-08-21 17:35:15  4 min 49.555 sec               0       0.49908
## 2 2020-08-21 17:35:15  4 min 49.558 sec               1       0.49253
## 3 2020-08-21 17:35:15  4 min 49.560 sec               2       0.35008
## 4 2020-08-21 17:35:15  4 min 49.562 sec               3       0.25795
## 5 2020-08-21 17:35:15  4 min 49.564 sec               4       0.19000
##   training_logloss training_auc training_pr_auc training_lift
## 1          0.69131      0.50000         0.00000       1.00000
## 2          0.67817      0.58774         0.57146       1.11212
## 3          0.42935      0.99870         0.94885       1.88578
## 4          0.29601      0.99852         0.99365       1.88578
## 5          0.20819      0.99971         0.95663       1.88578
##   training_classification_error
## 1                       0.46971
## 2                       0.45829
## 3                       0.00114
## 4                       0.00114
## 5                       0.00114
## 
## ---
##               timestamp          duration number_of_trees training_rmse
## 492 2020-08-21 17:35:19  4 min 53.518 sec             491       0.00000
## 493 2020-08-21 17:35:19  4 min 53.527 sec             492       0.00000
## 494 2020-08-21 17:35:19  4 min 53.535 sec             493       0.00000
## 495 2020-08-21 17:35:19  4 min 53.544 sec             494       0.00000
## 496 2020-08-21 17:35:19  4 min 53.553 sec             495       0.00000
## 497 2020-08-21 17:35:20  4 min 53.726 sec             754       0.00000
##     training_logloss training_auc training_pr_auc training_lift
## 492          0.00000      1.00000         0.00647       1.88578
## 493          0.00000      1.00000         0.00862       1.88578
## 494          0.00000      1.00000         0.00431       1.88578
## 495          0.00000      1.00000         0.00862       1.88578
## 496          0.00000      1.00000         0.00431       1.88578
## 497          0.00000      1.00000         0.00647       1.88578
##     training_classification_error
## 492                       0.00000
## 493                       0.00000
## 494                       0.00000
## 495                       0.00000
## 496                       0.00000
## 497                       0.00000
## 
## Variable Importances: (Extract with `h2o.varimp`) 
## =================================================
## 
## Variable Importances: 
##   variable relative_importance scaled_importance percentage
## 1    Today          252.705246          1.000000   0.947556
## 2        X            3.534384          0.013986   0.013253
## 3   Volume            2.454071          0.009711   0.009202
## 4     Year            2.439800          0.009655   0.009148
## 5     Lag1            2.343328          0.009273   0.008787
## 6     Lag2            1.668701          0.006603   0.006257
## 7     Lag4            1.261423          0.004992   0.004730
## 8     Lag5            0.266561          0.001055   0.001000
## 9     Lag3            0.018097          0.000072   0.000068
# pred <- h2o.predict(object=best_model, newdata=test_h2o)

# Predict the class for each held-out test observation with the best boosted model.
# NOTE(review): `best_model` is an h2o model, and h2o's predict method ignores the
# gbm-style `n.trees` argument -- confirm this argument is intentional.
pred.boost.test <- predict(best_model, newdata = test_h2o, n.trees = 6000)
pred.boost.test <- as_tibble(pred.boost.test)
# Since we encoded Yes as 1 and No as 0 we have

# Sanity check: exactly one prediction per test observation.
stopifnot(identical(length(pred.boost.test$predict),length(test.df$Direction)))

# Confusion Matrix (rows: predicted, columns: observed):
(confusion_table <- table(pred.boost.test$predict, test.df$Direction))
##       
##        Down  Up
##   Down  191   8
##   Up      0 176
# TN: observed as 0 and predicted as 0 (correctly predicted negatives)
true.negative <- confusion_table[1,1]

# TP: observed as 1 and predicted as 1 (correctly predicted positives)
true.positive <- confusion_table[2,2]

# FP: observed as 0 and predicted as 1 (predicted positive but actually negative)
false.positive <- confusion_table[2,1]

# FN: observed as 1 and predicted as 0 (predicted negative but actually positive)
false.negative <- confusion_table[1,2]

observations.total <- true.negative + false.negative + true.positive + false.positive
observed.as.0 <- true.negative + false.positive
observed.as.1 <- false.negative + true.positive

# Null-classifier (random-guessing) rate: accuracy of always predicting the majority class.
(nullClassifier <- max(observed.as.0 / observations.total, observed.as.1 / observations.total))
## [1] 0.5093333
# Classification rate (accuracy):
(classification.rate <- mean(pred.boost.test$predict == test.df$Direction))
## [1] 0.9786667
# Test error rate:
(missclassificationRate <- 1 - classification.rate)
## [1] 0.02133333
# False positive rate (Type I error, 1 - specificity):
(FP_rates <- false.positive / observed.as.0)
## [1] 0
# True positive rate (1 - Type II error; power, sensitivity, recall):
(TP_rates <- true.positive / observed.as.1)
## [1] 0.9565217
# Precision (positive predictive value, 1 - false discovery proportion):
(precisions <- true.positive / (true.positive + false.positive))
## [1] 1
# Specificity (true negative rate):
(specificities <-  1 - false.positive/(false.positive + true.negative))
## [1] 1
library(pROC)
# ROC curve built from the predicted probability of the "Up" class.
roc_obj <- roc(unlist(test.df$Direction), pred.boost.test$Up)
## Setting levels: control = Down, case = Up
## Setting direction: controls < cases
# Draw the ROC curve (legacy.axes: x axis shown as 1 - specificity).
plot(roc_obj, legacy.axes = TRUE)

library(tidyverse)       # for data wrangling
library(doParallel)  # for parallel backend to foreach
library(foreach)     # for parallel processing with for loops
library(dataPreparation)
# Modeling packages
library(caret)       # for general model fitting
library(rpart)       # for fitting decision trees
library(ipred)       # for fitting bagged decision trees


# Print warnings as they happen and fix the RNG seed so the 70/30
# train/test split below is reproducible.
options(warn = 1)
set.seed(1113)


# Load the Smarket data. Only "?" is treated as NA on read, so literal "NA"
# strings survive in character columns and are screened for explicitly below.
smarket.df <- 
  read.csv("/Users/shahrdadshadab/env/my-R-project/ISLR/Data/Smarket.csv", 
           header = TRUE, stringsAsFactors = FALSE, na.strings = "?")

str(smarket.df)
## 'data.frame':    1250 obs. of  10 variables:
##  $ X        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Year     : int  2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
##  $ Lag1     : num  0.381 0.959 1.032 -0.623 0.614 ...
##  $ Lag2     : num  -0.192 0.381 0.959 1.032 -0.623 ...
##  $ Lag3     : num  -2.624 -0.192 0.381 0.959 1.032 ...
##  $ Lag4     : num  -1.055 -2.624 -0.192 0.381 0.959 ...
##  $ Lag5     : num  5.01 -1.055 -2.624 -0.192 0.381 ...
##  $ Volume   : num  1.19 1.3 1.41 1.28 1.21 ...
##  $ Today    : num  0.959 1.032 -0.623 0.614 0.213 ...
##  $ Direction: chr  "Up" "Up" "Down" "Up" ...
# identify constant variables
constant_cols <- whichAreConstant(smarket.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# identify variables that duplicate another column (for example col1 == col2)
(double_cols <- whichAreInDouble(smarket.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# identify variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(smarket.df))
## [1] "whichAreBijection: it took me 0.06s to identify 0 column(s) to drop."
## integer(0)
# Drop bijection columns if any were found.
# Fixed: scalar && (short-circuit) instead of the vectorized & in this
# scalar if() condition.
smarket.df <- if (!is.null(bijections_cols) && length(bijections_cols) > 0) 
  smarket.df[- bijections_cols] else smarket.df

# Show rows where every column is NA (prints an empty tibble when clean).
smarket.df %>% 
   filter_all(all_vars(is.na(.))) 
# make sure no literal "NA" string is in the character column Direction
(na.count <- smarket.df %>%
  filter(Direction == "NA") %>%
  nrow)
## [1] 0
# Fixed: the original `stopifnot(na.count != numeric(0))` evaluates to
# logical(0), which stopifnot() always accepts -- the check could never fail.
# Assert the count is exactly zero instead.
stopifnot(na.count == 0L)

# make sure every value is either "Up" or "Down"

ups <- smarket.df %>%
  filter(Direction == "Up") %>%
  nrow

downs <- smarket.df %>%
  filter(Direction == "Down") %>%
  nrow

stopifnot(identical(ups + downs, nrow(smarket.df)))

# Encode the response as a factor (default alphabetical levels: "Down" < "Up").
smarket.df$Direction <- as.factor(smarket.df$Direction)

str(smarket.df$Direction)
##  Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...
# so "Up" -> 2 and "Down" -> 1 under the default level order
head(smarket.df$Direction)
## [1] Up   Up   Down Up   Up   Up  
## Levels: Down Up
head(as.integer(smarket.df$Direction))
## [1] 2 2 1 2 2 2
# split to train (70%) and test (30%)
(n <- nrow(smarket.df))
## [1] 1250
# Fixed: round to a whole number of observations. The original round(0.7*n, 3)
# rounded to 3 decimal places and only worked because 0.7 * 1250 is an integer.
(no.of.train <- round(0.7 * n))
## [1] 875
train.idx <- sample(seq_len(n), no.of.train) 
test.idx <- setdiff(seq_len(n), train.idx)

stopifnot(identical(length(train.idx) + length(test.idx), n))

train.df <- smarket.df[train.idx, ]
test.df <- smarket.df[test.idx, ]

# ------------------------------------
# Apply bagging.
# The OOB error rate (similar to CV) is used to pick the best number of trees.
# Results are collected in a preallocated list and bound once at the end
# instead of growing a tibble with rbind() inside the loop.
tree.counts <- 50:100
oob.results <- vector("list", length(tree.counts))
for (i in seq_along(tree.counts)) {
  no.tree <- tree.counts[i]

  train_time <- system.time({
    smarket.model <- bagging(
      formula = Direction ~ .,
      data = train.df,
      nbagg = no.tree, # number of bootstrap replications in the bagged model
      coob = TRUE,     # report the OOB error rate
      # rpart::rpart() builds deep trees (no pruning): a node needs only two
      # observations to attempt a split, and cp = 0 disables cost-complexity
      # pruning (matching the CV fit below).
      # Fixed: the original passed `no.of.trees = no.tree` and a positional
      # `train_time` into rpart.control() -- neither is an rpart parameter,
      # and `train_time` was read before it was ever assigned.
      control = rpart.control(minsplit = 2, cp = 0)
    )
  })

  # NOTE(review): train_time is a proc_time of length 5, so tibble() recycles
  # the scalar columns to 5 rows per iteration (original behavior, preserved).
  oob.results[[i]] <- tibble(
    OOB.err = smarket.model$err,
    no.of.trees = no.tree,
    train.time = train_time
  )
}
bagging.plot.data.OOB <- bind_rows(oob.results)
bagging.plot.data.OOB
# For all the trees the out-of-bag estimate of misclassification error is zero!
# Apply bagging with 10-fold CV to see how well the ensemble generalizes.

smarket.model.cv <- train(
  Direction ~ .,
  data = train.df,
  method = "treebag",
  trControl = trainControl(method = "cv", number = 10),
  nbagg = 200,  
  control = rpart.control(minsplit = 2, cp = 0)
)

#----------------- Interpretation of Bagging Model -----------------------
# Bagging improves the prediction accuracy for high variance (and low bias) 
# models at the expense of interpretability and computational speed. However, 
# using various interpretability algorithms such as VIPs and PDPs, 
# we can still make inferences about how our bagged model leverages feature information.

# With bagging, since we use many trees built on bootstrapped samples, we are likely 
# to see many more features used for splits. Consequently, we tend to have many 
# more features involved but with lower levels of importance.

# Variable importance plot for the top 5 features of the CV-bagged model.
vip::vip(smarket.model.cv, num_features = 5)

# Although the averaging effect of bagging diminishes the ability to interpret the 
# final ensemble, PDPs and other interpretability methods help us to 
# interpret any "black box" model. The plots below highlight the unique, and sometimes 
# non-linear, non-monotonic relationships that may exist between a feature and the response.

# Construct partial dependence plots for two predictors of interest.
p1 <- pdp::partial(
  smarket.model.cv, 
  pred.var = "Today",
  grid.resolution = 20
  ) %>% 
  autoplot()

p2 <- pdp::partial(
  smarket.model.cv, 
  pred.var = "Lag1",
  grid.resolution = 20
  ) %>% 
  autoplot()

# Arrange both PDPs side by side.
gridExtra::grid.arrange(p1, p2, nrow = 1)
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.

# Extract the final fitted ensemble from the caret wrapper for prediction below.
smarket.model <- smarket.model.cv$finalModel


# Predict the class for each held-out test observation with the final bagged model.

pred.bag.test <- predict(smarket.model, newdata = test.df)

# Sanity check: exactly one prediction per test observation.
stopifnot(identical(length(pred.bag.test),length(test.df$Direction)))

# Confusion Matrix (rows: predicted, columns: observed):
(confusion_table <- table(pred.bag.test, test.df$Direction))
##              
## pred.bag.test Down  Up
##          Down  190   0
##          Up      1 184
# TN: observed as 0 and predicted as 0 (correctly predicted negatives)
true.negative <- confusion_table[1,1]

# TP: observed as 1 and predicted as 1 (correctly predicted positives)
true.positive <- confusion_table[2,2]

# FP: observed as 0 and predicted as 1 (predicted positive but actually negative)
false.positive <- confusion_table[2,1]

# FN: observed as 1 and predicted as 0 (predicted negative but actually positive)
false.negative <- confusion_table[1,2]

observations.total <- true.negative + false.negative + true.positive + false.positive
observed.as.0 <- true.negative + false.positive
observed.as.1 <- false.negative + true.positive

# Null-classifier (random-guessing) rate: accuracy of always predicting the majority class.
(nullClassifier <- max(observed.as.0 / observations.total, observed.as.1 / observations.total))
## [1] 0.5093333
# Classification rate (accuracy):
(classification.rate <- mean(pred.bag.test == test.df$Direction))
## [1] 0.9973333
# Test error rate:
(missclassificationRate <- 1 - classification.rate)
## [1] 0.002666667
# False positive rate (Type I error, 1 - specificity):
(FP_rates <- false.positive / observed.as.0)
## [1] 0.005235602
# True positive rate (1 - Type II error; power, sensitivity, recall):
(TP_rates <- true.positive / observed.as.1)
## [1] 1
# Precision (positive predictive value, 1 - false discovery proportion):
(precisions <- true.positive / (true.positive + false.positive))
## [1] 0.9945946
# Specificity (true negative rate):
(specificities <-  1 - false.positive/(false.positive + true.negative))
## [1] 0.9947644
# -------------------- Parallelizing Bagging -----------------------------
# Since bagging is essentially a parallel algorithm we can use some parallelism here

# Create a parallel socket cluster
# cl <- makeCluster(8) # use 8 workers
# registerDoParallel(cl) # register the parallel backend
# 
# # Fit trees in parallel and compute predictions on the test set
# predictions <- foreach(
#   icount(160), 
#   .packages = "rpart", 
#   .combine = cbind
#   ) %dopar% {
#     # bootstrap copy of training data
#     index <- sample(nrow(train.df), replace = TRUE)
#     train_boot <- train.df[index, ]  
#   
#     # fit tree to bootstrap copy
#     bagged_tree <- rpart(
#       Direction ~ ., 
#       control = rpart.control(minsplit = 2, cp = 0),
#       data = train_boot
#       ) 
#     
#     predict(bagged_tree, newdata = test.df)
# }
# 
# # The result is 375 predictions (each for one row in test.df) 
# # performed by 160 trees (2*160columns , one for "Up" and one for "Down")
# predictions %>%
#   as.data.frame()
# 
# # We need to calculate RMSE
# predictions %>%
#   as.data.frame %>%
#   pmap(~sum(c(...))) %>%
#   unlist() %>%
#   tibble(Ups = . , Downs = ncol(predictions) - .) %>%
#   mutate(
#     observation = 1:n(),
#     actual = test.df$Direction)
#   
# stopCluster(cl)
library(tidyverse)       # for data wrangling
library(doParallel)  # for parallel backend to foreach
library(dataPreparation)
# Modeling packages
library(ranger)   # a c++ implementation of random forest 
## 
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
## 
##     importance
library(h2o)      # a java-based implementation of random forest


# Print warnings as they happen (the duplicated options(warn = 1) call was
# removed) and fix the RNG seed so the train/test split is reproducible.
options(warn = 1)
set.seed(1113)


# Reload the Smarket data for the random-forest section. Only "?" is treated
# as NA on read, so literal "NA" strings are screened for explicitly below.
smarket.df <- 
  read.csv("/Users/shahrdadshadab/env/my-R-project/ISLR/Data/Smarket.csv", 
           header = TRUE, stringsAsFactors = FALSE, na.strings = "?")

str(smarket.df)
## 'data.frame':    1250 obs. of  10 variables:
##  $ X        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Year     : int  2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
##  $ Lag1     : num  0.381 0.959 1.032 -0.623 0.614 ...
##  $ Lag2     : num  -0.192 0.381 0.959 1.032 -0.623 ...
##  $ Lag3     : num  -2.624 -0.192 0.381 0.959 1.032 ...
##  $ Lag4     : num  -1.055 -2.624 -0.192 0.381 0.959 ...
##  $ Lag5     : num  5.01 -1.055 -2.624 -0.192 0.381 ...
##  $ Volume   : num  1.19 1.3 1.41 1.28 1.21 ...
##  $ Today    : num  0.959 1.032 -0.623 0.614 0.213 ...
##  $ Direction: chr  "Up" "Up" "Down" "Up" ...
# identify constant variables
constant_cols <- whichAreConstant(smarket.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# identify variables that duplicate another column (for example col1 == col2)
(double_cols <- whichAreInDouble(smarket.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# identify variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(smarket.df))
## [1] "whichAreBijection: it took me 0.05s to identify 0 column(s) to drop."
## integer(0)
# Drop bijection columns if any were found.
# Fixed: scalar && (short-circuit) instead of the vectorized & in this
# scalar if() condition.
smarket.df <- if (!is.null(bijections_cols) && length(bijections_cols) > 0) 
  smarket.df[- bijections_cols] else smarket.df

# Show rows where every column is NA (prints an empty tibble when clean).
smarket.df %>% 
   filter_all(all_vars(is.na(.))) 
# make sure no literal "NA" string is in the character column Direction
na.count <- smarket.df %>%
  filter(Direction == "NA") %>%
  nrow

# Fixed: the original `stopifnot(na.count != numeric(0))` evaluates to
# logical(0), which stopifnot() always accepts -- the check could never fail.
# Assert the count is exactly zero instead.
stopifnot(na.count == 0L)

# make sure every value is either "Up" or "Down"

ups <- smarket.df %>%
  filter(Direction == "Up") %>%
  nrow

downs <- smarket.df %>%
  filter(Direction == "Down") %>%
  nrow

stopifnot(identical(ups + downs, nrow(smarket.df)))

# Encode the response as a factor (default alphabetical levels: "Down" < "Up").
smarket.df$Direction <- as.factor(smarket.df$Direction)

str(smarket.df$Direction)
##  Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...
# so "Up" -> 2 and "Down" -> 1 under the default level order
head(smarket.df$Direction)
## [1] Up   Up   Down Up   Up   Up  
## Levels: Down Up
head(as.integer(smarket.df$Direction))
## [1] 2 2 1 2 2 2
# split to train (70%) and test (30%)
n <- nrow(smarket.df)
# Fixed: round to a whole number of observations (digits = 3 was a typo that
# only worked because 0.7 * 1250 is already an integer).
no.of.train <- round(0.7 * n)
train.idx <- sample(seq_len(n), no.of.train) 
test.idx <- setdiff(seq_len(n), train.idx)

stopifnot(identical(length(train.idx) + length(test.idx), n))

train.df <- smarket.df[train.idx, ]
test.df <- smarket.df[test.idx, ]


# ----------------------------------------------
# --------------- Random forest ----------------

# By default, ranger sets the mtry parameter to floor(sqrt(number of features));
# however, for regression problems the preferred mtry to start with is
# floor(number of features / 3).

# We also set respect.unordered.factors = "order". This specifies how to treat 
# unordered factor variables and we recommend setting this to "order". 

# number of predictor features (everything except the response)
n_features <- length(setdiff(names(train.df), "Direction"))

# train a default random forest model
smarket_rf1 <- ranger(
  Direction ~ ., 
  data = train.df,
  mtry = floor(sqrt(n_features)),
  respect.unordered.factors = "order",
  seed = 123
)

# get OOB "RMSE"
# NOTE(review): for a classification forest, prediction.error is the OOB
# misclassification rate, so sqrt() of it is not literally an RMSE -- confirm
# this is the intended metric.
(default_rmse <- sqrt(smarket_rf1$prediction.error))
## [1] 0
## [1] 0
# The main hyperparameters to consider include:
# ----------------------------------------------
# _The number of trees in the forest
# _The number of features to consider at any given split:  mtry (have the largest impact on predictive accuracy)
# _The complexity of each tree (node size: one for classification and five for regression, 
#   max depth, max number of terminal nodes, or the required node size to allow additional splits)
# _The sampling scheme (by default each bootstrap copy has the same size as the original training data)
#  if you have categories that are not balanced, sampling without replacement provides 
#  a less biased use of all levels across the trees in the random forest.
# _The splitting rule to use during tree construction (used primarily to increase computational efficiency)
#  To increase computational efficiency, splitting rules can be randomized where only a random subset of 
#  possible splitting values is considered for a variable. 
#  If only a single random splitting value is randomly selected then we call this procedure extremely randomized trees. 
#  Due to the added #  randomness of split points, this method tends to have no improvement, 
# or often a negative impact, on predictive accuracy.

# Cartesian grid search: assess every combination of the hyperparameters of interest.
hyper_grid <- expand.grid(
  mtry = floor(n_features * c(.05, .15, .25, .333, .4)),
  min.node.size = c(1, 3, 5, 10), 
  replace = c(TRUE, FALSE),                               
  sample.fraction = c(.5, .63, .8),                       
  rmse = NA     # preallocated; filled in by the loop below
)

# execute full cartesian grid search
for (i in seq_len(nrow(hyper_grid))) {
  # fit a model for the ith hyperparameter combination
  fit <- ranger(
    formula         = Direction ~ ., 
    data            = train.df, 
    num.trees       = n_features * 10,
    mtry            = hyper_grid$mtry[i],
    min.node.size   = hyper_grid$min.node.size[i],
    replace         = hyper_grid$replace[i],
    sample.fraction = hyper_grid$sample.fraction[i],
    verbose         = FALSE,
    seed            = 123,
    # Fixed: removed the trailing comma that followed this argument -- it
    # created an empty (missing) argument in the call.
    respect.unordered.factors = "order"
  )
  # record the OOB error for this combination
  hyper_grid$rmse[i] <- sqrt(fit$prediction.error)
}

# assess top 10 models:
# NOTE(review): default_rmse printed as 0 above, so perc_gain divides by zero
# and yields NaN/-Inf -- interpret that column with care.
hyper_grid %>%
  arrange(rmse) %>%
  mutate(perc_gain = (default_rmse - rmse) / default_rmse * 100) %>%
  head(10)
# Sampling only 50% of train.df with replacement consistently performs best;
# mtry = 2 and min.node.size = 1 are good choices.
# seems like sampleing only 50% of train.df with replacements consistently performs the best.
# Also mtry = 2 and min.node.size = 1 are good choices

# ------------------- Fit Random forest with h2o ---------------------
# Suppress h2o progress bars, then start/attach a local h2o cluster
# (all available cores, up to 10 GB heap).
h2o.no_progress()
h2o.init(ip = "localhost", port = 54321, nthreads= -1, max_mem_size = "10g")
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 hours 11 minutes 
##     H2O cluster timezone:       America/Toronto 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.28.0.2 
##     H2O cluster version age:    7 months !!! 
##     H2O cluster name:           H2O_started_from_R_shahrdadshadab_mxj391 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   8.12 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 3.6.2 (2019-12-12)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is too old (7 months)!
## Please download and install the latest version from http://h2o.ai/download/
# Clear any objects left over from previous runs, then push the train/test
# frames into the h2o cluster and name the response/predictor columns.
h2o.removeAll()

h2o.train <- as.h2o(train.df)
h2o.test <- as.h2o(test.df)
response <- "Direction"
predictors <- setdiff(colnames(train.df), response)

# Fit a default h2o random forest (ntrees scaled with the feature count)
# and print its OOB-based training metrics.
h2o.smarket_rf1 <- h2o.randomForest(
  x = predictors,
  y = response,
  training_frame = h2o.train,
  ntrees = n_features*10,
  seed = 123
)
h2o.smarket_rf1
## Model Details:
## ==============
## 
## H2OBinomialModel: drf
## Model ID:  DRF_model_R_1598041551527_127807 
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1              90                       90               27537         1
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1        17    7.12222          2         81    19.65556
## 
## 
## H2OBinomialMetrics: drf
## ** Reported on training data. **
## ** Metrics reported on Out-Of-Bag training samples **
## 
## MSE:  0.003180651
## RMSE:  0.05639726
## LogLoss:  0.03548078
## Mean Per-Class Error:  0
## AUC:  1
## AUCPR:  0.887931
## Gini:  1
## R^2:  0.9872305
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##        Down  Up    Error    Rate
## Down    411   0 0.000000  =0/411
## Up        0 464 0.000000  =0/464
## Totals  411 464 0.000000  =0/875
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.659042   1.000000 188
## 2                       max f2  0.659042   1.000000 188
## 3                 max f0point5  0.659042   1.000000 188
## 4                 max accuracy  0.659042   1.000000 188
## 5                max precision  1.000000   1.000000   0
## 6                   max recall  0.659042   1.000000 188
## 7              max specificity  1.000000   1.000000   0
## 8             max absolute_mcc  0.659042   1.000000 188
## 9   max min_per_class_accuracy  0.659042   1.000000 188
## 10 max mean_per_class_accuracy  0.659042   1.000000 188
## 11                     max tns  1.000000 411.000000   0
## 12                     max fns  1.000000 412.000000   0
## 13                     max fps  0.000000 411.000000 262
## 14                     max tps  0.659042 464.000000 188
## 15                     max tnr  1.000000   1.000000   0
## 16                     max fnr  1.000000   0.887931   0
## 17                     max fpr  0.000000   1.000000 262
## 18                     max tpr  0.659042   1.000000 188
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Let's perform a larger search with h2o

# Hyperparameter space explored by the random grid search.
hyper_grid <- list(
  mtries = floor(n_features * c(.12,.15,.25,.333,.4)),
  min_rows = c(1,3,5,10),
  max_depth = c(10,20,30),
  sample_rate = c(.55, .632, .70, .80)
)

# random grid search strategy (h2o requires a list here)
search_criteria <- list(
  strategy = "RandomDiscrete",
  stopping_metric = "mse",
  stopping_tolerance = 0.001,  # stop if MSE improves by less than 0.1%...
  stopping_rounds = 10,       # ...over the last 10 models
  max_runtime_secs = 60*5    # or stop the search after 5 minutes  
)


# perform grid search 
# (each candidate forest also uses per-model early stopping on RMSE)
grid <- h2o.grid(
  algorithm = "randomForest",
  grid_id = "rf_random_grid",
  x = predictors, 
  y = response,
  training_frame = h2o.train,
  hyper_params = hyper_grid,
  ntrees = n_features*10,
  stopping_metric = "RMSE",
  stopping_rounds = 10,
  stopping_tolerance = 0.005,
  search_criteria = search_criteria,
  seed = 123
)

# collect the results and sort by our model performance metric of choice (MSE, ascending)
grid_perf <- h2o.getGrid(
  grid_id = "rf_random_grid", 
  sort_by = "mse", 
  decreasing = FALSE
)

# Best result observed:
# max_depth -> 20,  min_rows -> 1.0,    mtries -> 3, sample_rate -> 0.7 achieved an RMSE of 0.0028
grid_perf
## H2O Grid Details
## ================
## 
## Grid ID: rf_random_grid 
## Used hyper parameters: 
##   -  max_depth 
##   -  min_rows 
##   -  mtries 
##   -  sample_rate 
## Number of models: 240 
## Number of failed models: 0 
## 
## Hyper-Parameter Search Summary: ordered by increasing mse
##   max_depth min_rows mtries sample_rate                model_ids
## 1        20      1.0      3         0.7  rf_random_grid_model_53
## 2        30      1.0      3         0.7  rf_random_grid_model_77
## 3        20      1.0      3         0.8 rf_random_grid_model_195
## 4        30      1.0      3         0.8 rf_random_grid_model_215
## 5        10      1.0      3         0.7 rf_random_grid_model_100
##                     mse
## 1 0.0028904270252804376
## 2 0.0028904270252804376
## 3 0.0028944841220222553
## 4 0.0028944841220222553
## 5 0.0029727719123189143
## 
## ---
##     max_depth min_rows mtries sample_rate                model_ids
## 235        30     10.0      1         0.7 rf_random_grid_model_143
## 236        20     10.0      1         0.7 rf_random_grid_model_206
## 237        30     10.0      1         0.7 rf_random_grid_model_208
## 238        20     10.0      1         0.7 rf_random_grid_model_240
## 239        10     10.0      1         0.7 rf_random_grid_model_136
## 240        10     10.0      1         0.7  rf_random_grid_model_31
##                     mse
## 235 0.07957467846168671
## 236 0.07957467846168671
## 237 0.07957467846168671
## 238 0.07957467846168671
## 239 0.07997928810857083
## 240 0.07997928810857083
# Permutation-based feature importance:
# ----------------------------------------------  
#   In the permutation-based approach, for each tree the OOB sample is passed
#   down the tree and the prediction accuracy is recorded. Then the values of
#   each variable (one at a time) are randomly permuted and the accuracy is
#   computed again. The decrease in accuracy caused by this random shuffling
#   of feature values is averaged over all the trees for each predictor.
#   The variables with the largest average decrease in accuracy are considered
#   most important.

# For ranger, once you've identified the optimal parameter values from the
# grid search, re-run the model with those hyperparameter values. Cranking up
# the number of trees also helps produce more stable variable-importance values.

# Helper: refit the tuned forest, varying only how variable importance is
# measured. Both fits below share identical hyperparameters, so the call is
# parameterized instead of duplicated.
fit_tuned_rf <- function(importance_mode) {
  ranger(
    formula = Direction ~ ., 
    data = train.df, 
    num.trees = 2000,
    mtry = 3,
    min.node.size = 1, # i.e. min_rows
    sample.fraction = .80,
    replace = TRUE,
    importance = importance_mode,
    respect.unordered.factors = "order",
    verbose = FALSE,
    seed  = 123
  )
}

# re-run model with impurity-based variable importance
rf_impurity <- fit_tuned_rf("impurity")

# re-run model with permutation-based variable importance
rf_permutation <- fit_tuned_rf("permutation")

# variable importance plots
vip::vip(rf_impurity) 

vip::vip(rf_permutation) 

# Today is the most important feature in both plots.
library(tidyverse)
# Demo: flag rows whose cells are either a real NA or the literal string "NA"
# (possibly padded with whitespace).
(t <- tibble(x = c("", " NA", "12", "34", " NA", "NA", NA), y = c(NA, "qsfg", "fbf", "12", "NA", "NA ", NA)))
# Row indices containing at least one missing-like cell.
(na.idx <- which(
  pmap_lgl(t, function(...) {
    cells <- c(...)
    any(is.na(cells) | str_trim(cells) == "NA")
  })
))
## [1] 1 2 5 6 7
# Remaining (clean) row indices.
rest.idx <- setdiff(seq_len(nrow(t)), na.idx)

t[na.idx, ]
t[rest.idx, ]
# ------------------------
# library("lme4")
# library("glmmADMB")      ## (not on CRAN)
# library("glmmTMB")
# library("MCMCglmm")
# library("blme")
# library("MASS")          ## for glmmPQL (base R)
# library("nlme")          ## for intervals(), tundra example (base R)
# ## auxiliary
# library("ggplot2")       ## for pretty plots generally
# ## ggplot customization:
# theme_set(theme_bw())
# scale_colour_discrete <- function(...,palette="Set1") {
#     scale_colour_brewer(...,palette=palette)
# }
# scale_colour_orig <- ggplot2::scale_colour_discrete
# scale_fill_discrete <- function(...,palette="Set1") {
#     scale_fill_brewer(...,palette=palette)
# }
# ## to squash facets together ...
# zmargin <- theme(panel.spacing=grid::unit(0,"lines"))
# library("gridExtra")     ## for grid.arrange()
# library("broom.mixed")
# ## n.b. as of 25 Sep 2018, need bbolker github version of dotwhisker ...
# library("dotwhisker")
# library("coda")      ## MCMC diagnostics
# library("aods3")     ## overdispersion diagnostics
# library("plotMCMC") ## pretty plots from MCMC fits
# library("bbmle")     ## AICtab
# library("pbkrtest")  ## parametric bootstrap
# library("Hmisc")
# ## for general-purpose reshaping and data manipulation:
# library("reshape2")
# library("plyr")
# ## for illustrating effects of observation-level variance in binary data:
# library("numDeriv")
# library("glmmADMB")
# bb <- glmmADMB:::get_bin_loc()[["bin_loc"]]
# bpath <- gsub("glmmadmb$","",bb)
# file.copy(bb,paste0(bpath,"glmmadmb.bak"))
# bburl <- "http://admb-project.org/buildbot/glmmadmb/"
# download.file(paste0(bburl,
#    "glmmadmb-mingw64-r2885-windows8-mingw64.exe"), dest=bb)